Muhammed Sameer commited on
Commit
ea9ca44
·
0 Parent(s):

Initial commit - Iris Full (under development)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +24 -0
  2. README.md +12 -0
  3. Supabase/.temp/cli-latest +1 -0
  4. Supabase/config.toml +66 -0
  5. Supabase/functions/_shared/cors.ts +4 -0
  6. Supabase/functions/generate-verification-token/.npmrc +3 -0
  7. Supabase/functions/generate-verification-token/deno.json +3 -0
  8. Supabase/functions/generate-verification-token/index.ts +60 -0
  9. Supabase/functions/initiate-admin-transfer/.npmrc +3 -0
  10. Supabase/functions/initiate-admin-transfer/deno.json +3 -0
  11. Supabase/functions/initiate-admin-transfer/index.ts +80 -0
  12. Supabase/functions/invite-first-admin/.npmrc +3 -0
  13. Supabase/functions/invite-first-admin/deno.json +3 -0
  14. Supabase/functions/invite-first-admin/index.ts +24 -0
  15. Supabase/functions/otp/.npmrc +3 -0
  16. Supabase/functions/otp/deno.json +3 -0
  17. Supabase/functions/otp/index.ts +137 -0
  18. Supabase/functions/send-interview-email/.npmrc +3 -0
  19. Supabase/functions/send-interview-email/deno.json +3 -0
  20. Supabase/functions/send-interview-email/index.ts +51 -0
  21. Supabase/functions/verify-domain/.npmrc +3 -0
  22. Supabase/functions/verify-domain/deno.json +3 -0
  23. Supabase/functions/verify-domain/index.ts +32 -0
  24. backend/.env +12 -0
  25. backend/.gitignore +26 -0
  26. backend/add_experience_to_embeddings.sql +5 -0
  27. backend/add_projects_to_profiles.sql +5 -0
  28. backend/api.py +157 -0
  29. backend/create_profile_embeddings.sql +32 -0
  30. backend/debug_payload.json +69 -0
  31. backend/debug_resume.txt +6 -0
  32. backend/requirements.txt +28 -0
  33. backend/src/__init__.py +0 -0
  34. backend/src/embeddings/__init__.py +0 -0
  35. backend/src/embeddings/debug_embedding_storage.py +62 -0
  36. backend/src/embeddings/job_embed.py +108 -0
  37. backend/src/embeddings/local_embedder.py +137 -0
  38. backend/src/embeddings/process_all_profiles.py +46 -0
  39. backend/src/embeddings/test_embedder.py +34 -0
  40. backend/src/extraction/__init__.py +0 -0
  41. backend/src/extraction/fallback_extractor.py +51 -0
  42. backend/src/extraction/job_extractor.py +181 -0
  43. backend/src/extraction/person_details_extraction_gemini.py +220 -0
  44. backend/src/extraction/test_regex.py +33 -0
  45. backend/src/ingestion/__init__.py +0 -0
  46. backend/src/ingestion/docx_reader.py +11 -0
  47. backend/src/ingestion/parser.py +21 -0
  48. backend/src/ingestion/pdf_reader.py +38 -0
  49. backend/src/matching/__init__.py +0 -0
  50. backend/src/matching/similarity.py +0 -0
.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+
15
+ # Editor directories and files
16
+ .vscode/*
17
+ !.vscode/extensions.json
18
+ .idea
19
+ .DS_Store
20
+ *.suo
21
+ *.ntvs*
22
+ *.njsproj
23
+ *.sln
24
+ *.sw?
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # React + Vite
2
+
3
+ This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
4
+
5
+ Currently, two official plugins are available:
6
+
7
+ - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) for Fast Refresh
8
+ - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
9
+
10
+ ## Expanding the ESLint configuration
11
+
12
+ If you are developing a production application, we recommend using TypeScript with type-aware lint rules enabled. Check out the [TS template](https://github.com/vitejs/vite/tree/main/packages/create-vite/template-react-ts) for information on how to integrate TypeScript and [`typescript-eslint`](https://typescript-eslint.io) in your project.
Supabase/.temp/cli-latest ADDED
@@ -0,0 +1 @@
 
 
1
+ v2.67.1
Supabase/config.toml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ [functions.verify-domain]
3
+ enabled = true
4
+ verify_jwt = true
5
+ import_map = "./functions/verify-domain/deno.json"
6
+ # Uncomment to specify a custom file path to the entrypoint.
7
+ # Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
8
+ entrypoint = "./functions/verify-domain/index.ts"
9
+ # Specifies static files to be bundled with the function. Supports glob patterns.
10
+ # For example, if you want to serve static HTML pages in your function:
11
+ # static_files = [ "./functions/verify-domain/*.html" ]
12
+
13
+ [functions.invite-first-admin]
14
+ enabled = true
15
+ verify_jwt = true
16
+ import_map = "./functions/invite-first-admin/deno.json"
17
+ # Uncomment to specify a custom file path to the entrypoint.
18
+ # Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
19
+ entrypoint = "./functions/invite-first-admin/index.ts"
20
+ # Specifies static files to be bundled with the function. Supports glob patterns.
21
+ # For example, if you want to serve static HTML pages in your function:
22
+ # static_files = [ "./functions/invite-first-admin/*.html" ]
23
+
24
+ [functions.generate-verification-token]
25
+ enabled = true
26
+ verify_jwt = true
27
+ import_map = "./functions/generate-verification-token/deno.json"
28
+ # Uncomment to specify a custom file path to the entrypoint.
29
+ # Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
30
+ entrypoint = "./functions/generate-verification-token/index.ts"
31
+ # Specifies static files to be bundled with the function. Supports glob patterns.
32
+ # For example, if you want to serve static HTML pages in your function:
33
+ # static_files = [ "./functions/generate-verification-token/*.html" ]
34
+
35
+ [functions.initiate-admin-transfer]
36
+ enabled = true
37
+ verify_jwt = true
38
+ import_map = "./functions/initiate-admin-transfer/deno.json"
39
+ # Uncomment to specify a custom file path to the entrypoint.
40
+ # Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
41
+ entrypoint = "./functions/initiate-admin-transfer/index.ts"
42
+ # Specifies static files to be bundled with the function. Supports glob patterns.
43
+ # For example, if you want to serve static HTML pages in your function:
44
+ # static_files = [ "./functions/initiate-admin-transfer/*.html" ]
45
+
46
+ [functions.otp]
47
+ enabled = true
48
+ verify_jwt = true
49
+ import_map = "./functions/otp/deno.json"
50
+ # Uncomment to specify a custom file path to the entrypoint.
51
+ # Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
52
+ entrypoint = "./functions/otp/index.ts"
53
+ # Specifies static files to be bundled with the function. Supports glob patterns.
54
+ # For example, if you want to serve static HTML pages in your function:
55
+ # static_files = [ "./functions/otp/*.html" ]
56
+
57
+ [functions.send-interview-email]
58
+ enabled = true
59
+ verify_jwt = true
60
+ import_map = "./functions/send-interview-email/deno.json"
61
+ # Uncomment to specify a custom file path to the entrypoint.
62
+ # Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
63
+ entrypoint = "./functions/send-interview-email/index.ts"
64
+ # Specifies static files to be bundled with the function. Supports glob patterns.
65
+ # For example, if you want to serve static HTML pages in your function:
66
+ # static_files = [ "./functions/send-interview-email/*.html" ]
Supabase/functions/_shared/cors.ts ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ export const corsHeaders = {
2
+ 'Access-Control-Allow-Origin': '*',
3
+ 'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type',
4
+ }
Supabase/functions/generate-verification-token/.npmrc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Configuration for private npm package dependencies
2
+ # For more information on using private registries with Edge Functions, see:
3
+ # https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
Supabase/functions/generate-verification-token/deno.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "imports": {}
3
+ }
Supabase/functions/generate-verification-token/index.ts ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // supabase/functions/generate-verification-token/index.ts
2
+
3
+ import { serve } from 'https://deno.land/std@0.177.0/http/server.ts'
4
+ import { createClient } from 'https://esm.sh/@supabase/supabase-js@2.39.8'
5
+ import { corsHeaders } from '../_shared/cors.ts'
6
+
7
+ serve(async (req) => {
8
+ // This is the crucial block that handles the browser's preflight check
9
+ if (req.method === 'OPTIONS') {
10
+ return new Response('ok', { headers: corsHeaders })
11
+ }
12
+
13
+ try {
14
+ const { email } = await req.json();
15
+ const domain = email.split('@')[1];
16
+ if (!domain) {
17
+ throw new Error("Invalid email format.");
18
+ }
19
+
20
+ const supabaseAdmin = createClient(
21
+ Deno.env.get('SUPABASE_URL')!,
22
+ Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!
23
+ );
24
+
25
+ const { data: blockedDomain } = await supabaseAdmin
26
+ .from('blocked_domains')
27
+ .select('domain')
28
+ .eq('domain', domain)
29
+ .single();
30
+
31
+ if (blockedDomain) {
32
+ throw new Error("Please use a business email. Free email providers are not allowed.");
33
+ }
34
+
35
+ const { data, error } = await supabaseAdmin
36
+ .from('organizations')
37
+ .insert({
38
+ name: domain,
39
+ verified_domain: domain,
40
+ })
41
+ .select('verification_token')
42
+ .single();
43
+
44
+ if (error) throw error;
45
+
46
+ return new Response(JSON.stringify({
47
+ verification_token: data.verification_token,
48
+ domain: domain
49
+ }), {
50
+ headers: { ...corsHeaders, 'Content-Type': 'application/json' },
51
+ status: 200,
52
+ });
53
+
54
+ } catch (error) {
55
+ return new Response(JSON.stringify({ error: error.message }), {
56
+ headers: { ...corsHeaders, 'Content-Type': 'application/json' },
57
+ status: 400,
58
+ });
59
+ }
60
+ })
Supabase/functions/initiate-admin-transfer/.npmrc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Configuration for private npm package dependencies
2
+ # For more information on using private registries with Edge Functions, see:
3
+ # https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
Supabase/functions/initiate-admin-transfer/deno.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "imports": {}
3
+ }
Supabase/functions/initiate-admin-transfer/index.ts ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { createClient } from 'https://esm.sh/@supabase/supabase-js@2'
2
+ import { v4 as uuidv4 } from 'https://deno.land/std@0.106.0/uuid/mod.ts';
3
+
4
+ const corsHeaders = {
5
+ 'Access-Control-Allow-Origin': '*',
6
+ 'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type',
7
+ }
8
+
9
+ Deno.serve(async (req) => {
10
+ if (req.method === 'OPTIONS') {
11
+ return new Response('ok', { headers: corsHeaders })
12
+ }
13
+
14
+ try {
15
+ const { newAdminEmail } = await req.json();
16
+ if (!newAdminEmail) {
17
+ throw new Error("New admin's email is required.");
18
+ }
19
+
20
+ // Create an admin client to bypass RLS
21
+ const supabaseAdmin = createClient(
22
+ Deno.env.get('SUPABASE_URL')!,
23
+ Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!
24
+ );
25
+
26
+ // Get the current user from the request's auth token
27
+ const authHeader = req.headers.get('Authorization')!;
28
+ const jwt = authHeader.replace('Bearer ', '');
29
+ const { data: { user } } = await supabaseAdmin.auth.getUser(jwt);
30
+ if (!user) {
31
+ throw new Error("Could not identify the current user.");
32
+ }
33
+
34
+ // 1. Generate a secure, unique token for the transfer
35
+ const transferToken = uuidv4();
36
+ const expiryDate = new Date();
37
+ expiryDate.setHours(expiryDate.getHours() + 24); // Token is valid for 24 hours
38
+
39
+ // 2. Store the token and link it to the current user's company
40
+ // This assumes the 'companies' table has 'admin_transfer_token' and 'admin_transfer_expires_at' columns
41
+ const { data: profile, error: profileError } = await supabaseAdmin
42
+ .from('profiles').select('company_id').eq('id', user.id).single();
43
+ if (profileError || !profile) throw new Error("Could not find the user's company.");
44
+
45
+ const { error: updateError } = await supabaseAdmin
46
+ .from('companies')
47
+ .update({
48
+ admin_transfer_token: transferToken,
49
+ admin_transfer_expires_at: expiryDate.toISOString(),
50
+ })
51
+ .eq('id', profile.company_id);
52
+ if (updateError) throw new Error("Failed to store the transfer token.");
53
+
54
+ // 3. Send a magic link email to the new admin
55
+ // This link should point to a page in your app that handles the token verification
56
+ const transferUrl = `${Deno.env.get('SITE_URL')}/accept-admin-transfer?token=${transferToken}`;
57
+
58
+ const { error: emailError } = await supabaseAdmin.auth.admin.generateLink({
59
+ type: 'magiclink',
60
+ email: newAdminEmail,
61
+ options: {
62
+ redirectTo: transferUrl
63
+ }
64
+ });
65
+
66
+ if (emailError) {
67
+ throw new Error("Could not send invitation email.");
68
+ }
69
+
70
+ return new Response(JSON.stringify({ success: true, message: "Transfer invitation sent." }), {
71
+ headers: { ...corsHeaders, 'Content-Type': 'application/json' },
72
+ status: 200,
73
+ })
74
+ } catch (err) {
75
+ return new Response(JSON.stringify({ error: err.message }), {
76
+ headers: { ...corsHeaders, 'Content-Type': 'application/json' },
77
+ status: 400,
78
+ })
79
+ }
80
+ })
Supabase/functions/invite-first-admin/.npmrc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Configuration for private npm package dependencies
2
+ # For more information on using private registries with Edge Functions, see:
3
+ # https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
Supabase/functions/invite-first-admin/deno.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "imports": {}
3
+ }
Supabase/functions/invite-first-admin/index.ts ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { serve } from 'https://deno.land/std@0.177.0/http/server.ts'
2
+ import { createClient } from 'https://esm.sh/@supabase/supabase-js@2.39.8'
3
+ import { corsHeaders } from '../_shared/cors.ts'
4
+
5
+ serve(async (req) => {
6
+ if (req.method === 'OPTIONS') { return new Response('ok', { headers: corsHeaders }) }
7
+ try {
8
+ const { adminEmail, domain } = await req.json();
9
+ if (!adminEmail || !domain) throw new Error("Admin email and domain are required.");
10
+ if (adminEmail.split('@')[1] !== domain) throw new Error("Admin email must belong to the verified domain.");
11
+
12
+ const supabaseAdmin = createClient(Deno.env.get('SUPABASE_URL')!, Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!);
13
+
14
+ const { data: orgData, error: orgError } = await supabaseAdmin.from('organizations').select('id').eq('verified_domain', domain).eq('is_verified', true).single();
15
+ if (orgError || !orgData) throw new Error("Cannot send invite: Organization is not verified.");
16
+
17
+ const { error: inviteError } = await supabaseAdmin.auth.admin.inviteUserByEmail(adminEmail);
18
+ if (inviteError) throw inviteError;
19
+
20
+ return new Response(JSON.stringify({ success: true, message: `Invitation sent to ${adminEmail}.` }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' } });
21
+ } catch (error) {
22
+ return new Response(JSON.stringify({ error: error.message }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 400 });
23
+ }
24
+ })
Supabase/functions/otp/.npmrc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Configuration for private npm package dependencies
2
+ # For more information on using private registries with Edge Functions, see:
3
+ # https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
Supabase/functions/otp/deno.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "imports": {}
3
+ }
Supabase/functions/otp/index.ts ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { serve } from "https://deno.land/std@0.168.0/http/server.ts";
2
+ import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
3
+
4
+ const corsHeaders = {
5
+ 'Access-Control-Allow-Origin': '*',
6
+ 'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type',
7
+ };
8
+
9
+ serve(async (req) => {
10
+ if (req.method === 'OPTIONS') {
11
+ return new Response('ok', { headers: corsHeaders });
12
+ }
13
+
14
+ try {
15
+ // 1. Init Supabase Clients
16
+ const supabaseAdmin = createClient(
17
+ Deno.env.get('SUPABASE_URL') ?? '',
18
+ Deno.env.get('SUPABASE_SERVICE_ROLE_KEY') ?? ''
19
+ );
20
+
21
+ const authHeader = req.headers.get('Authorization')!;
22
+ const supabaseClient = createClient(
23
+ Deno.env.get('SUPABASE_URL') ?? '',
24
+ Deno.env.get('SUPABASE_ANON_KEY') ?? '',
25
+ { global: { headers: { Authorization: authHeader } } }
26
+ );
27
+
28
+ // 2. Auth Check
29
+ const { data: { user }, error: authError } = await supabaseClient.auth.getUser();
30
+ if (authError || !user) throw new Error("Unauthorized");
31
+
32
+ const { action, userCode } = await req.json();
33
+
34
+ // ==========================================
35
+ // ACTION: SEND SMS (VIA TWILIO)
36
+ // ==========================================
37
+ if (action === 'send') {
38
+ const { data: profile } = await supabaseAdmin
39
+ .from('profiles')
40
+ .select('phone')
41
+ .eq('id', user.id)
42
+ .single();
43
+
44
+ if (!profile?.phone) throw new Error("No phone number found in profile.");
45
+
46
+ const phone = profile.phone;
47
+ const otp = (100000 + (crypto.getRandomValues(new Uint32Array(1))[0] % 900000)).toString(); // use a CSPRNG for auth codes — Math.random() is predictable
48
+ const expiresAt = new Date(Date.now() + 5 * 60 * 1000).toISOString();
49
+
50
+ // Upsert OTP to DB
51
+ const { error: upsertError } = await supabaseAdmin
52
+ .from('otp_verifications')
53
+ .upsert({ phone, otp_code: otp, expires_at: expiresAt, attempts_count: 0 }, { onConflict: 'phone' });
54
+
55
+ if (upsertError) throw upsertError;
56
+
57
+ // --- TWILIO SENDING LOGIC STARTS HERE ---
58
+ const accountSid = Deno.env.get("TWILIO_ACCOUNT_SID");
59
+ const authToken = Deno.env.get("TWILIO_AUTH_TOKEN");
60
+ const fromNumber = Deno.env.get("TWILIO_PHONE_NUMBER");
61
+
62
+ if (!accountSid || !authToken || !fromNumber) {
63
+ throw new Error("Twilio secrets are missing in Supabase.");
64
+ }
65
+
66
+ // Format parameters for Twilio API
67
+ const params = new URLSearchParams();
68
+ params.append('To', phone);
69
+ params.append('From', fromNumber);
70
+ params.append('Body', `Your Verification Code is: ${otp}`);
71
+
72
+ console.log(`Sending SMS to ${phone}...`);
73
+
74
+ const twilioRes = await fetch(
75
+ `https://api.twilio.com/2010-04-01/Accounts/${accountSid}/Messages.json`,
76
+ {
77
+ method: "POST",
78
+ headers: {
79
+ "Authorization": `Basic ${btoa(`${accountSid}:${authToken}`)}`,
80
+ "Content-Type": "application/x-www-form-urlencoded",
81
+ },
82
+ body: params,
83
+ }
84
+ );
85
+
86
+ if (!twilioRes.ok) {
87
+ const errorText = await twilioRes.text();
88
+ console.error("Twilio Error:", errorText);
89
+ throw new Error("Failed to send SMS. Check server logs.");
90
+ }
91
+ // --- TWILIO LOGIC ENDS HERE ---
92
+
93
+ return new Response(
94
+ JSON.stringify({ message: "OTP sent successfully" }),
95
+ { headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 200 }
96
+ );
97
+ }
98
+
99
+ // ==========================================
100
+ // ACTION: VERIFY
101
+ // ==========================================
102
+ if (action === 'verify') {
103
+ if (!userCode) throw new Error("Missing OTP code");
104
+
105
+ const { data: profile } = await supabaseAdmin.from('profiles').select('phone').eq('id', user.id).single();
106
+ const phone = profile?.phone;
107
+
108
+ const { data: record } = await supabaseAdmin.from('otp_verifications').select('*').eq('phone', phone).single();
109
+
110
+ if (!record) throw new Error("Invalid or expired OTP.");
111
+ if (new Date() > new Date(record.expires_at)) throw new Error("OTP has expired.");
112
+ if (record.attempts_count >= 3) throw new Error("Too many attempts.");
113
+
114
+ if (record.otp_code !== userCode) {
115
+ await supabaseAdmin.from('otp_verifications').update({ attempts_count: record.attempts_count + 1 }).eq('phone', phone);
116
+ throw new Error("Incorrect OTP code.");
117
+ }
118
+
119
+ // Success
120
+ await supabaseAdmin.from('profiles').update({ is_phone_verified: true }).eq('id', user.id);
121
+ await supabaseAdmin.from('otp_verifications').delete().eq('phone', phone);
122
+
123
+ return new Response(
124
+ JSON.stringify({ message: "Phone verified successfully!" }),
125
+ { headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 200 }
126
+ );
127
+ }
128
+
129
+ return new Response(JSON.stringify({ error: "Invalid Action" }), { status: 400, headers: corsHeaders });
130
+
131
+ } catch (error) {
132
+ return new Response(
133
+ JSON.stringify({ error: error.message }),
134
+ { headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 400 }
135
+ );
136
+ }
137
+ });
Supabase/functions/send-interview-email/.npmrc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Configuration for private npm package dependencies
2
+ # For more information on using private registries with Edge Functions, see:
3
+ # https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
Supabase/functions/send-interview-email/deno.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "imports": {}
3
+ }
Supabase/functions/send-interview-email/index.ts ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // supabase/functions/send-interview-email/index.ts
2
+ import { serve } from "https://deno.land/std@0.168.0/http/server.ts";
3
+ import { Resend } from "npm:resend";
4
+
5
+ const resend = new Resend(Deno.env.get("RESEND_API_KEY"));
6
+
7
+ const corsHeaders = {
8
+ "Access-Control-Allow-Origin": "*",
9
+ "Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
10
+ };
11
+
12
+ serve(async (req) => {
13
+ if (req.method === "OPTIONS") return new Response("ok", { headers: corsHeaders });
14
+
15
+ try {
16
+ const { candidateName, candidateEmail, date, time, meetingLink, role } = await req.json();
17
+
18
+ const { data, error } = await resend.emails.send({
19
+ from: "Acme HR <onboarding@resend.dev>", // Change this to your verified domain if you have one
20
+ to: [candidateEmail],
21
+ subject: `Interview Invitation: ${role}`,
22
+ html: `
23
+ <div style="font-family: sans-serif; padding: 20px;">
24
+ <h1>Hi ${candidateName},</h1>
25
+ <p>We are pleased to invite you to a <strong>Technical Interview</strong> for the <strong>${role}</strong> position.</p>
26
+
27
+ <div style="background: #f3f4f6; padding: 15px; border-radius: 8px; margin: 20px 0;">
28
+ <p style="margin: 5px 0;"><strong>📅 Date:</strong> ${date}</p>
29
+ <p style="margin: 5px 0;"><strong>⏰ Time:</strong> ${time}</p>
30
+ <p style="margin: 5px 0;"><strong>🔗 Link:</strong> <a href="${meetingLink}">${meetingLink}</a></p>
31
+ </div>
32
+
33
+ <p>Please join 5 minutes early.</p>
34
+ <p>Best,<br>Hiring Team</p>
35
+ </div>
36
+ `,
37
+ });
38
+
39
+ if (error) throw error;
40
+
41
+ return new Response(JSON.stringify(data), {
42
+ headers: { ...corsHeaders, "Content-Type": "application/json" },
43
+ status: 200,
44
+ });
45
+ } catch (error) {
46
+ return new Response(JSON.stringify({ error: error.message }), {
47
+ headers: { ...corsHeaders, "Content-Type": "application/json" },
48
+ status: 500,
49
+ });
50
+ }
51
+ });
Supabase/functions/verify-domain/.npmrc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Configuration for private npm package dependencies
2
+ # For more information on using private registries with Edge Functions, see:
3
+ # https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
Supabase/functions/verify-domain/deno.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "imports": {}
3
+ }
Supabase/functions/verify-domain/index.ts ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { serve } from 'https://deno.land/std@0.177.0/http/server.ts'
2
+ import { createClient } from 'https://esm.sh/@supabase/supabase-js@2.39.8'
3
+ import { corsHeaders } from '../_shared/cors.ts'
4
+
5
+ serve(async (req) => {
6
+ if (req.method === 'OPTIONS') { return new Response('ok', { headers: corsHeaders }) }
7
+ try {
8
+ const { domain } = await req.json();
9
+ if (!domain) throw new Error("Domain is required.");
10
+
11
+ const supabaseAdmin = createClient(Deno.env.get('SUPABASE_URL')!, Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!);
12
+
13
+ const { data: orgData, error: orgError } = await supabaseAdmin.from('organizations').select('verification_token, is_verified').eq('verified_domain', domain).single();
14
+ if (orgError) throw new Error("Could not find an organization for this domain.");
15
+ if (orgData.is_verified) return new Response(JSON.stringify({ success: true, message: 'Domain is already verified.' }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' } });
16
+
17
+ const expectedToken = `${orgData.verification_token}`;
18
+ let isVerified = false;
19
+ const txtRecords = await Deno.resolveDns(domain, "TXT");
20
+
21
+ for (const record of txtRecords) { if (record.includes(expectedToken)) { isVerified = true; break; } }
22
+
23
+ if (isVerified) {
24
+ await supabaseAdmin.from('organizations').update({ is_verified: true, verification_token: null }).eq('verified_domain', domain); // clear the one-time token once verified
25
+ return new Response(JSON.stringify({ success: true, message: 'Domain verified!' }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' } });
26
+ } else {
27
+ throw new Error("Verification failed. TXT record not found or has not propagated yet.");
28
+ }
29
+ } catch (error) {
30
+ return new Response(JSON.stringify({ error: error.message }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 400 });
31
+ }
32
+ })
backend/.env ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SECURITY: a live OpenAI key was committed here — revoke/rotate it immediately and keep .env out of version control
+ OPENAI_API_KEY=__ROTATED_DO_NOT_COMMIT__
2
+ # SECURITY: a live Gemini key was committed here — revoke/rotate it immediately
+ GEMINI_API_KEY=__ROTATED_DO_NOT_COMMIT__
3
+ # Supabase configuration (fill these with your project values)
4
+ # SUPABASE_URL: e.g. https://your-project.supabase.co
5
+ # SUPABASE_KEY: service role key or anon key (prefer service role for server-side ops)
6
+ SUPABASE_URL=https://obhychdzwbytlzwrjrbl.supabase.co
7
+ # SECURITY: the Supabase service-role key (bypasses RLS) was committed here — rotate it in the dashboard immediately
+ SUPABASE_SERVICE_ROLE_KEY=__ROTATED_DO_NOT_COMMIT__
8
+ # Optional: storage bucket and prefix to fetch resumes from
9
+ SUPABASE_BUCKET=resume
10
+ SUPABASE_PREFIX=""
11
+ # Set to 1/true/yes to enable automatic fetching from Supabase when running `run_pipeline.py`
12
+ USE_SUPABASE_RAW=1
backend/.gitignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Secrets
2
+
3
+
4
+ # Python
5
+ __pycache__/
6
+ *.pyc
7
+ *.pyo
8
+ *.pyd
9
+ .Python
10
+ env/
11
+ venv/
12
+ .venv/
13
+ pip-log.txt
14
+ pip-delete-this-directory.txt
15
+ .tox/
16
+ .coverage
17
+ .coverage.*
18
+ .cache
19
+ nosetests.xml
20
+ coverage.xml
21
+ *.cover
22
+ *.log
23
+ .pytest_cache/
24
+
25
+ # Data
26
+ data/
backend/add_experience_to_embeddings.sql ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ -- Add the missing 'experience' column to profile_embeddings
3
+ -- BAAI/bge-m3 uses 1024 dimensions
4
+ alter table profile_embeddings
5
+ add column if not exists experience vector(1024);
backend/add_projects_to_profiles.sql ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ -- Add 'projects' column to profiles table to store extracted project details
3
+ -- It will store a JSONB array of objects: [{ title, technologies_used, description }]
4
+ alter table profiles
5
+ add column if not exists projects jsonb default '[]'::jsonb;
backend/api.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # api.py
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ # Load env BEFORE importing modules that depend on it
6
+ load_dotenv()
7
+
8
+ from fastapi import FastAPI, HTTPException, UploadFile, Form, File
9
+ from pydantic import BaseModel
10
+ from supabase import create_client
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from supabase_ingest import process_resume
13
+ from src.extraction.job_extractor import process_single_job
14
+ from src.services.ats_service import analyze_ats_compatibility
15
+
16
+
17
+ app = FastAPI()
18
+
19
+ app.add_middleware(
20
+ CORSMiddleware,
21
+ allow_origins=["*"], # Allow all origins for dev; restrict in prod
22
+ allow_credentials=True,
23
+ allow_methods=["*"],
24
+ allow_headers=["*"],
25
+ )
26
+
27
+ # Setup Supabase Client
28
+ SUPABASE_URL = os.environ.get("SUPABASE_URL")
29
+ # Use Service Role Key if available to bypass RLS
30
+ SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") or os.environ.get("SUPABASE_KEY")
31
+
32
+ if not SUPABASE_URL or not SUPABASE_KEY:
33
+ raise RuntimeError("SUPABASE_URL and SUPABASE_KEY (or SUPABASE_SERVICE_ROLE_KEY) must be set in .env")
34
+
35
+ client = create_client(SUPABASE_URL, SUPABASE_KEY)
36
+
37
+ # Define the data we expect from the frontend
38
+ class ResumeRequest(BaseModel):
39
+ user_id: str
40
+ file_path: str # e.g., "user_123/resume.pdf"
41
+
42
+ @app.post("/process-resume")
43
+ async def process_resume_endpoint(request: ResumeRequest):
44
+ print(f"🔔 Signal received: Process resume for {request.user_id}")
45
+
46
+ try:
47
+ # Delegate everything to the unified function
48
+ extracted_data = process_resume(client, request.user_id, request.file_path)
49
+ return {"status": "success", "data": extracted_data}
50
+
51
+ except Exception as e:
52
+ print(f"❌ Error: {e}")
53
+ raise HTTPException(status_code=500, detail=str(e))
54
+
55
+ # ---------------------------------------------------------------------
56
+ # WEBHOOK ENDPOINT (Called by Supabase)
57
+ # ---------------------------------------------------------------------
58
+
59
+ from typing import Dict, Any, Optional
60
+
61
+ class StorageEventRequest(BaseModel):
62
+ type: str
63
+ table: str
64
+ record: Dict[str, Any]
65
+ # NOTE(review): a field named "schema" shadows pydantic's BaseModel.schema — consider `schema_name: str = Field(alias="schema")`; verify the installed pydantic version tolerates this
+ schema: str
66
+ old_record: Optional[Dict[str, Any]] = None
67
+
68
+ @app.post("/webhook/storage")
69
+ async def storage_webhook(request: StorageEventRequest):
70
+ """
71
+ Handles Database Webhooks from Supabase (storage.objects insert).
72
+ """
73
+ print(f"🔔 Webhook received: {request.type} on {request.table}")
74
+
75
+ # We only care about INSERTs or UPDATEs (overwrites) to the 'resume' bucket
76
+ if request.type not in ["INSERT", "UPDATE"] or request.table != "objects":
77
+ return {"status": "ignored"}
78
+
79
+ # Extract file details from the record
80
+ # Object path example: "user_123/123456_resume.pdf"
81
+ file_path = request.record.get("name")
82
+ bucket_id = request.record.get("bucket_id")
83
+
84
+ # Check bucket
85
+ if bucket_id != "resume":
86
+ print(f"⚠️ Ignoring upload to bucket: {bucket_id}")
87
+ return {"status": "ignored", "reason": "wrong bucket"}
88
+
89
+ # Extract User ID (assuming folder structure: user_id/filename)
90
+ try:
91
+ user_id = file_path.split("/")[0]
92
+ except Exception:
93
+ print(f"❌ Could not extract user_id from {file_path}")
94
+ return {"status": "error", "message": "invalid file path structure"}
95
+
96
+ print(f"▶️ Triggering processing for {file_path}")
97
+
98
+ # Call the processing logic
99
+ try:
100
+ process_resume(client, user_id, file_path)
101
+ return {"status": "success"}
102
+ except Exception as e:
103
+ print(f"❌ Processing failed: {e}")
104
+ raise HTTPException(status_code=500, detail=str(e))
105
+
106
+
107
+ @app.post("/webhook/jobs")
108
+ async def jobs_webhook(request: StorageEventRequest):
109
+ """
110
+ Handles Database Webhooks from Supabase (jobs table UPDATE/INSERT).
111
+ """
112
+ print(f"🔔 Webhook received: {request.type} on {request.table}")
113
+
114
+ if request.table != "jobs":
115
+ return {"status": "ignored", "reason": "wrong table"}
116
+
117
+ # We care about INSERT and UPDATE
118
+ # For UPDATE, we might want to check whether the description changed, but for now we run it anyway
119
+
120
+ new_record = request.record
121
+ job_id = new_record.get("id")
122
+ description = new_record.get("description")
123
+ experience_level = new_record.get("experience_level")
124
+
125
+ if not job_id:
126
+ print("❌ Webhook missing job_id")
127
+ return {"status": "error", "message": "missing id"}
128
+
129
+ print(f"▶️ Triggering job extraction for Job ID: {job_id}")
130
+
131
+ try:
132
+ # Re-use global client from line 32
133
+ process_single_job(client, job_id, description, experience_level)
134
+ return {"status": "success"}
135
+ except Exception as e:
136
+ print(f"❌ Job processing failed: {e}")
137
+ raise HTTPException(status_code=500, detail=str(e))
138
+
139
+ @app.post("/analyze-ats")
140
+ async def analyze_ats_endpoint(
141
+ resume: UploadFile = File(...),
142
+ job_description: str = Form(...)
143
+ ):
144
+ """
145
+ Real-time ATS analysis endpoint.
146
+ Does not save to DB (unless you want to add that logic).
147
+ """
148
+ print(f"🔍 Analyzing ATS compatibility for: {resume.filename}")
149
+ try:
150
+ result = await analyze_ats_compatibility(resume, job_description)
151
+ return {"status": "success", "data": result}
152
+ except Exception as e:
153
+ print(f"❌ ATS Analysis failed: {e}")
154
+ raise HTTPException(status_code=500, detail=str(e))
155
+
156
+
157
+ # Run with: uvicorn api:app --reload
backend/create_profile_embeddings.sql ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Enable the pgvector extension to work with embedding vectors
create extension if not exists vector;

-- Create a table to store embeddings for each profile column.
-- We use 1024 dimensions for the BAAI/bge-m3 model.
-- One row per profile; each vector column mirrors the same-named text
-- column on profiles. "on delete cascade" removes the row together with
-- its parent profile.
create table if not exists profile_embeddings (
  id uuid references profiles(id) on delete cascade primary key,
  headline vector(1024),
  summary vector(1024),
  skills vector(1024),
  technical_skills vector(1024),
  experience vector(1024),
  certifications vector(1024),
  languages vector(1024),
  created_at timestamp with time zone default timezone('utc'::text, now()) not null,
  updated_at timestamp with time zone default timezone('utc'::text, now()) not null
);

-- Enable Row Level Security (RLS)
alter table profile_embeddings enable row level security;

-- Create policies (Adjust based on your actual auth requirements)
-- Allow read access to everyone (or authenticated users)
create policy "Allow read access for all users"
on profile_embeddings for select
using ( true );

-- Allow update/insert only for service_role or the user who owns the profile
-- (Assuming auth.uid() matches the profile id)
-- NOTE(review): "for all ... using" applies the same ownership check to
-- insert/update/delete; service_role bypasses RLS entirely — confirm this
-- matches the intended write path.
create policy "Users can update their own embeddings"
on profile_embeddings for all
using ( auth.uid() = id );
backend/debug_payload.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "id": "81185bdc-85be-4ff2-99c7-16cf8356cb51",
3
+ "resume_url": "81185bdc-85be-4ff2-99c7-16cf8356cb51/resume.pdf",
4
+ "file_hash": "f6f9d1e0b3badc01329126aa9f249a3e26f1ba12e26d274de0323c359faa1c13",
5
+ "processed": true,
6
+ "updated_at": "now()",
7
+ "full_name": "med Raffi",
8
+ "summary": "Computer Science student proficient in Python, Java, and C with strong skills in Object-Oriented Programming. Experienced in software development and version control using Git. Adaptable team player focused on solving complex technical challenges.",
9
+ "phone": "+9195390771",
10
+ "email": "saheedmuhammedraffi@gmail.com",
11
+ "skills": [
12
+ "Communication",
13
+ "Teamwork",
14
+ "Adaptability",
15
+ "Analytical Thinking"
16
+ ],
17
+ "technical_skills": "Python, Java, C, SQL, HTML, CSS, JavaScript, Flask, React, Pandas, Scikit-learn, NumPy, Git, VSCode, GoogleColab, Docker, TensorFlow",
18
+ "education": [
19
+ {
20
+ "course": "B.Tech in Computer Science and Engineering",
21
+ "institution": "Carmel College of Engineering and Technology, Alappuzha",
22
+ "year": "2022 Present"
23
+ },
24
+ {
25
+ "course": "Higher Secondary Education",
26
+ "institution": "S.D.V. English Medium Higher Secondary School, Alappuzha",
27
+ "year": null
28
+ }
29
+ ],
30
+ "work_experience": [
31
+ {
32
+ "role": "AI/ML Intern",
33
+ "company": "ICT Academy of Kerala, Trivandrum",
34
+ "years": "Jun 2025 - Jul 2025",
35
+ "description": "Underwent a 1-month internship on Artificial Intelligence and Machine Learning. Collaborated with a 5-member team to deploy a prototype ML model tested on real-world datasets."
36
+ },
37
+ {
38
+ "role": "Webmaster",
39
+ "company": "IEEE Computer Society",
40
+ "years": "July 2025 Present",
41
+ "description": "Developed a responsive web portal and admin dashboard to streamline real-time event tracking and member registration."
42
+ },
43
+ {
44
+ "role": "TEDxCCET Curation Lead",
45
+ "company": "Dept. of Computer Science, CCET",
46
+ "years": "Nov 2025 Present",
47
+ "description": "Manage speaker logistics, schedules, and deliverables to ensure strict adherence to event timelines. Coordinate technical requirements and stage cues between speakers and the production team."
48
+ }
49
+ ],
50
+ "projects": [
51
+ {
52
+ "tech_stack": [
53
+ "React",
54
+ "Supabase"
55
+ ],
56
+ "description": "A full-stack milk management and distribution system that automates milk collection, farmer payments, billing, and delivery tracking through a centralized platform."
57
+ },
58
+ {
59
+ "tech_stack": [
60
+ "Python",
61
+ "TensorFlow",
62
+ "Flask",
63
+ "React"
64
+ ],
65
+ "description": "Developed an LSTM-based model to forecast short-term stock prices using live data. Integrated the trained model into a Flask API with a React interface for real-time trend prediction."
66
+ }
67
+ ],
68
+ "certifications": "AIML Internship ICT Academy of Kerala 2025, Python Foundation Certification Springboard 2025, Programming in Java NPTEL 2024"
69
+ }
backend/debug_resume.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Candidate Name: Jane Doe
2
+ Email: jane@example.com
3
+ Projects:
4
+ 1. E-Commerce App
5
+ Tech Stack: React, Node.js
6
+ Description: A shopping site.
backend/requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ================== Core utilities ==================
2
+ python-dotenv>=1.0.0
3
+ pandas>=2.0.0
4
+ tqdm>=4.66.0
5
+
6
+ # ================== PDF / DOC processing ==================
7
+ pypdf>=3.0.0
8
+ pdfplumber>=0.10.0
9
+ python-docx>=0.8.11
10
+ unicodedata2>=0.7.2
11
+
12
+ # ================== NLP preprocessing ==================
13
+ nltk>=3.8.1
14
+
15
+ # ================== Hugging Face / ML ==================
16
+ transformers>=4.44.0
17
+ torch>=2.2.0
18
+ sentence-transformers>=2.2.2
19
+ datasets>=2.19.0
20
+ accelerate>=0.30.0
21
+
22
+ # ================== APIs ==================
23
+ openai>=1.30.0
24
+ supabase>=2.0.0
25
+ fastapi>=0.109.0
26
+ uvicorn>=0.27.0
27
+ python-multipart>=0.0.9
28
+ google-genai>=0.2.0
backend/src/__init__.py ADDED
File without changes
backend/src/embeddings/__init__.py ADDED
File without changes
backend/src/embeddings/debug_embedding_storage.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1

import sys
import os
import time  # NOTE(review): unused in this script

# Add 'backend' directory to path so we can import 'supabase_ingest' directly
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

from supabase_ingest import safe_generate_and_store_embeddings, client

# Mock data
# NOTE(review): profiles.id is declared uuid in create_profile_embeddings.sql;
# a plain string id may be rejected by Postgres — confirm the column type.
user_id = "test_user_debug_123"
extracted_data = {
    "headline": "Debug Engineer",
    "summary": "This is a test summary for debugging.",
    "skills": "Debug, Python",  # DB stores as string
    "technical_skills": "SQL, Vector DB",  # DB stores as string
    "certifications": "",
    "languages": "English"  # DB stores as string
}

print(f"DEBUG: Testing embedding storage for User ID: {user_id}")

# 1. Ensure user exists in profiles first (FK constraint)
try:
    print("DEBUG: Ensuring profile exists...")
    # UPSERT the mock data into the profiles table so the function can fetch it
    profile_payload = {
        "id": user_id,
        "full_name": "Debug User",
        "email": "debug@example.com",
        "updated_at": "now()",
        # Add the fields we expect to be there
        "headline": extracted_data["headline"],
        "summary": extracted_data["summary"],
        "skills": extracted_data["skills"],
        "technical_skills": extracted_data["technical_skills"],
        "certifications": extracted_data["certifications"],
        "languages": extracted_data["languages"]
    }
    client.table("profiles").upsert(profile_payload).execute()
    print("DEBUG: Profile upserted.")
except Exception as e:
    print(f"❌ Failed to create test profile: {e}")
    sys.exit(1)

# 2. Run the function
print("DEBUG: Running safe_generate_and_store_embeddings...")
# Now it fetches from DB internally, so we don't pass extracted_data
safe_generate_and_store_embeddings(client, user_id)

# 3. Check if it exists
try:
    print("DEBUG: Verifying storage...")
    resp = client.table("profile_embeddings").select("*").eq("id", user_id).execute()
    if resp.data:
        print("✅ SUCCESS: Embedding record found!")
        print(f"Data keys: {resp.data[0].keys()}")
    else:
        print("❌ FAILURE: No record found in profile_embeddings.")
except Exception as e:
    print(f"❌ Verification failed: {e}")
backend/src/embeddings/job_embed.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ from typing import List
4
+ from dotenv import load_dotenv
5
+ from supabase import create_client
6
+ from sentence_transformers import SentenceTransformer
7
+
8
+ # Load env
9
+ load_dotenv()
10
+
11
+ SUPABASE_URL = os.environ.get("SUPABASE_URL")
12
+ SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") or os.environ.get("SUPABASE_KEY")
13
+
14
+ # Singleton model (same pattern as profile code)
15
+ _model = None
16
+
17
def get_model():
    """Lazily load and cache the BAAI/bge-m3 sentence-transformer model."""
    global _model
    if _model is not None:
        return _model
    print("📥 Loading BAAI/bge-m3 model for job embeddings...")
    _model = SentenceTransformer("BAAI/bge-m3")
    return _model
23
+
24
def get_supabase():
    """Build a Supabase client, or return None when credentials are absent."""
    if SUPABASE_URL and SUPABASE_KEY:
        return create_client(SUPABASE_URL, SUPABASE_KEY)
    print("❌ Missing Supabase credentials for job embeddings.")
    return None
29
+
30
+ # -------- Embedding helpers (IDENTICAL LOGIC) --------
31
+
32
def generate_embedding(text: str) -> List[float]:
    """Embed one text into a normalized 1024-d vector; zeros for blank input."""
    if text and text.strip():
        vector = get_model().encode(text, normalize_embeddings=True)
        return vector.tolist()
    return [0.0] * 1024
39
+
40
def generate_list_embedding(items: List[str]) -> List[float]:
    """Embed each string then mean-pool into one 1024-d vector (zeros if empty)."""
    if not items:
        return [0.0] * 1024
    vectors = get_model().encode(items, normalize_embeddings=True)
    return np.mean(vectors, axis=0).tolist()
48
+
49
+ # ----------------------------------------------------
50
+
51
def safe_generate_and_store_job_embeddings(client, job_id: str) -> None:
    """
    Fetch a job's extracted entities, embed each entity group, and upsert
    the resulting vectors into the job_embeddings table.

    Never raises: missing rows and embedding failures are logged only.
    """
    print(f"🧬 Generating job embeddings for Job: {job_id}")

    # 1. Fetch the extracted entities for this job.
    resp = client.table("job_entities").select("*").eq("job_id", job_id).execute()
    if not resp.data:
        print(f"⚠️ Job entities not found for job_id={job_id}")
        return

    entities = resp.data[0]

    # 2. Fields may arrive either as arrays or CSV strings — accept both.
    def as_list(value):
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            return [part.strip() for part in value.split(",") if part.strip()]
        return []

    try:
        # 3. Generate entity-wise embeddings and 4. upsert in one payload.
        payload = {
            "job_id": job_id,
            "skills": generate_list_embedding(as_list(entities.get("skills"))),
            "technical_skills": generate_list_embedding(as_list(entities.get("technical_skills"))),
            "tools": generate_list_embedding(as_list(entities.get("tools"))),
            "experience": generate_embedding(entities.get("experience") or ""),
            "education": generate_embedding(entities.get("education") or ""),
            "certifications": generate_list_embedding(as_list(entities.get("certifications"))),
            "updated_at": "now()",
        }
        client.table("job_embeddings").upsert(payload).execute()
        print(f"✅ Job embeddings stored for job_id={job_id}")
    except Exception as e:
        print(f"❌ Job embedding generation failed: {e}")
107
+
108
+
backend/src/embeddings/local_embedder.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import json
4
+ import numpy as np
5
+ from typing import List, Any
6
+ from dotenv import load_dotenv
7
+ from supabase import create_client
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ # Load env
11
+ load_dotenv()
12
+
13
+ SUPABASE_URL = os.environ.get("SUPABASE_URL")
14
+ SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") or os.environ.get("SUPABASE_KEY")
15
+
16
+ # Initialize Model (Globals are bad but efficient for serverless-ish/script use)
17
+ # Using a singleton pattern to avoid reloading model on every call if imported
18
+ _model = None
19
+
20
def get_model():
    """Return the process-wide BAAI/bge-m3 model, loading it on first use."""
    global _model
    if _model is not None:
        return _model
    print("📥 Loading BAAI/bge-m3 model...")
    _model = SentenceTransformer('BAAI/bge-m3')
    return _model
26
+
27
def get_supabase():
    """Create a Supabase client from env vars; None when credentials missing."""
    if SUPABASE_URL and SUPABASE_KEY:
        return create_client(SUPABASE_URL, SUPABASE_KEY)
    print("❌ Missing Supabase credentials for embeddings.")
    return None
32
+
33
def generate_embedding(text: str) -> List[float]:
    """Embed one text with BGE-M3 (1024-d, normalized); zeros for blank input."""
    if not text or not text.strip():
        return [0.0] * 1024  # BGE-M3 is 1024d

    vector = get_model().encode(text, normalize_embeddings=True)
    return vector.tolist()
41
+
42
def generate_list_embedding(items: List[str]) -> List[float]:
    """Embed every string and mean-pool into a single 1024-d vector."""
    if not items:
        return [0.0] * 1024
    vectors = get_model().encode(items, normalize_embeddings=True)
    # Mean pooling across the item embeddings.
    return np.mean(vectors, axis=0).tolist()
51
+
52
def safe_generate_and_store_embeddings(client, user_id: str) -> None:
    """
    Fetch a profile row, embed its text fields, and upsert the vectors
    into profile_embeddings (columns match create_profile_embeddings.sql).

    Never raises: missing profiles and embedding failures are logged only.
    """
    print(f"🧬 Generating embeddings for User: {user_id}")

    # 1. Fetch the profile row.
    resp = client.table("profiles").select("*").eq("id", user_id).execute()
    if not resp.data:
        print(f"⚠️ Profile not found for {user_id}")
        return
    profile = resp.data[0]

    # 2. Plain-text fields.
    summary = profile.get("summary") or ""
    headline = profile.get("headline") or ""
    role = profile.get("role") or ""

    # Fields stored either as arrays or CSV strings — accept both shapes.
    def as_list(value):
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            return [part.strip() for part in value.split(",") if part.strip()]
        return []

    skills = as_list(profile.get("skills"))
    tech_skills = as_list(profile.get("technical_skills"))
    certifications = as_list(profile.get("certifications"))

    # work_experience may be JSONB (list of dicts) — flatten each entry into
    # a "Role at Company. Description" sentence so it embeds like plain text.
    experience = []
    raw_experience = profile.get("work_experience") or []
    if isinstance(raw_experience, list):
        for entry in raw_experience:
            if isinstance(entry, dict):
                experience.append(
                    f"{entry.get('role') or ''} at {entry.get('company') or ''}. "
                    f"{entry.get('description') or ''}"
                )
            elif isinstance(entry, str):
                experience.append(entry)

    try:
        # 3. Generate embeddings and 4. upsert them in one payload.
        payload = {
            "id": user_id,
            "headline": generate_embedding(f"{role} {headline}"),
            "summary": generate_embedding(summary),
            "skills": generate_list_embedding(skills),
            "technical_skills": generate_list_embedding(tech_skills),
            "experience": generate_list_embedding(experience),
            "certifications": generate_list_embedding(certifications),
            "updated_at": "now()",
        }
        client.table("profile_embeddings").upsert(payload).execute()
        print(f"✅ Embeddings stored for {user_id}")
    except Exception as e:
        print(f"❌ Embedding generation failed: {e}")
131
+
132
if __name__ == "__main__":
    # Manual smoke-test hook: builds a client; plug in a real profile id
    # and call safe_generate_and_store_embeddings to exercise the pipeline.
    sb = get_supabase()
    if sb:
        # Replace with a valid ID for testing if needed
        pass
backend/src/embeddings/process_all_profiles.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import sys
3
+ import os
4
+ import time
5
+
6
+ # Add backend to path
7
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
8
+
9
+ from supabase_ingest import client, safe_generate_and_store_embeddings
10
+
11
def process_all_profiles():
    """Regenerate embeddings for every row in the 'profiles' table."""
    print("🔍 Fetching all user IDs from 'profiles' table...")

    try:
        # Only ids are needed: the embedding helper re-fetches the full row.
        response = client.table("profiles").select("id").execute()
        profiles = response.data
        if not profiles:
            print("⚠️ No profiles found in database.")
            return

        total = len(profiles)
        print(f"✅ Found {total} profiles to process.")

        for index, row in enumerate(profiles, start=1):
            user_id = row['id']
            print(f"\n[{index}/{total}] Processing User ID: {user_id}")
            # safe_generate_and_store_embeddings handles fetching the profile,
            # CSV-list parsing, BGE-M3 encoding, and the upsert into
            # profile_embeddings.
            safe_generate_and_store_embeddings(client, user_id)

            # Small delay to be nice to the CPU/API
            # time.sleep(0.1)

        print("\n🎉 Batch processing complete!")

    except Exception as e:
        print(f"❌ Error fetching profiles: {e}")

if __name__ == "__main__":
    process_all_profiles()
backend/src/embeddings/test_embedder.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1

import sys
import os
import numpy as np  # NOTE(review): unused in this script

# Add backend to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

# NOTE(review): local_embedder (as committed) defines generate_embedding,
# generate_list_embedding and safe_generate_and_store_embeddings, but no
# dict-in/dict-out generate_embeddings — this import will raise ImportError
# as-is; confirm the intended API before relying on this test.
from backend.src.embeddings.local_embedder import generate_embeddings

# Mixed fixture: string fields, list fields, and one empty list to exercise
# the "skip empty" path asserted below.
sample_data = {
    "headline": "Senior Software Engineer",
    "summary": "Experienced in Python and AI.",
    "skills": ["Communication", "Leadership", "Agile"],
    "technical_skills": ["Python", "FastAPI", "React"],
    "certifications": [],  # Empty list
    "languages": ["English", "Spanish"]
}

print("Running Embedding Generation Test...")
embeddings = generate_embeddings(sample_data)

print("\nResults:")
# Every returned vector is expected to be 1024-d (BGE-M3 output size).
for key, vector in embeddings.items():
    vec_len = len(vector)
    print(f"Field: {key:20} | Dimensions: {vec_len} | Sample: {vector[:3]}...")

    if vec_len != 1024:
        print(f"❌ ERROR: Expected 1024 dimensions, got {vec_len}")

# Empty inputs should be omitted from the result entirely.
if "certifications" not in embeddings:
    print("Field: certifications | Correctly skipped (empty)")

print("\nDone.")
backend/src/extraction/__init__.py ADDED
File without changes
backend/src/extraction/fallback_extractor.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def extract_fallback(text: str) -> dict:
    """
    A dumb Regex-based fallback extractor if Gemini fails.
    Extracts basic info like Email, Phone, Links, and keyword-matched Skills.
    """
    # 1. Email (basic pattern; first match wins)
    email_match = re.search(r"[\w\.-]+@[\w\.-]+\.\w+", text)
    email = email_match.group(0) if email_match else None

    # 2. Phone (very basic — catches 10-12 digit numbers)
    phone_match = re.search(r"(\+?\d{1,3}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text)
    phone = phone_match.group(0) if phone_match else None

    # 3. Links: classify URLs into LinkedIn / GitHub / anything-else (portfolio)
    links = re.findall(r"https?://[^\s]+", text)
    linkedin = next((url for url in links if "linkedin.com" in url), None)
    github = next((url for url in links if "github.com" in url), None)
    portfolio = next((url for url in links if url not in [linkedin, github]), None)

    # 4. Keyword matching against a static skill list (case-insensitive,
    # whole-word matches only)
    COMMON_SKILLS = [
        "Python", "Java", "JavaScript", "TypeScript", "C++", "C#", "SQL", "NoSQL",
        "React", "Angular", "Vue", "Node.js", "Django", "Flask", "FastAPI",
        "AWS", "Azure", "GCP", "Docker", "Kubernetes", "Git", "CI/CD",
        "Machine Learning", "Deep Learning", "NLP", "Pandas", "NumPy", "TensorFlow", "PyTorch"
    ]
    found_skills = []
    for skill in COMMON_SKILLS:
        if re.search(r"\b" + re.escape(skill) + r"\b", text, re.IGNORECASE):
            found_skills.append(skill)

    # 5. Payload shaped like the Gemini extraction schema
    truncated = text[:500] + "..." if len(text) > 500 else text
    return {
        "headline": None,
        "summary": truncated,  # Fallback summary is just first 500 chars
        "skills": found_skills,
        "technical_skills": found_skills,  # Duplicate for safety
        "education": [],
        "work_experience": [],
        "certifications": [],
        "languages": [],
        "experience_years": None,
        # Extra fields specific to Supabase Ingest (mapped later)
        # "email": email, # Backend doesn't use extracted email usually (uses auth), but good to have
        "phone": phone,
        "linkedin": linkedin,
        "github": github,
        "portfolio": portfolio
    }
backend/src/extraction/job_extractor.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import time
5
+ from dotenv import load_dotenv
6
+ from typing import Any, Dict, List, Optional
7
+ from google import genai
8
+ from google.genai import types
9
+
10
+ from supabase import create_client
11
+
12
+ # ------------------ CONFIGURATION ------------------
13
+ RAW_DIR = "data/jobs/raw"
14
+ PROCESSED_DIR = "data/jobs/entities"
15
+
16
+ # ------------------ SETUP ------------------
17
+ load_dotenv()
18
+ SUPABASE_URL = os.environ.get("SUPABASE_URL")
19
+ SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") or os.environ.get("SUPABASE_KEY")
20
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
21
+
22
+ if GEMINI_API_KEY:
23
+ try:
24
+ client = genai.Client(api_key=GEMINI_API_KEY)
25
+ except Exception as e:
26
+ client = None
27
+ print(f"⚠️ Failed to initialize Gemini client: {e}")
28
+ else:
29
+ client = None
30
+ print("⚠️ GEMINI_API_KEY not set; extraction will be disabled.")
31
+
32
+
33
def clean_text(text: str) -> str:
    """Strip HTML tags and non-ASCII runs, then collapse whitespace."""
    for pattern in (r"<.*?>", r"[^\x00-\x7F]+", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()
38
+
39
+
40
def extract_job_entities_gemini(text: str) -> Dict[str, Any]:
    """
    Extract categorized entities from a job description via Gemini.

    Retries up to 3 times on 503/overloaded errors with exponential backoff.
    Returns the parsed JSON dict, or {} when extraction is disabled or fails.
    """
    cleaned_text = clean_text(text)

    system_prompt = """
    You are an intelligent information extractor specialized in job descriptions.
    Your task is to extract ONLY what is explicitly mentioned and categorize them into the following JSON structure.

    Output JSON Schema:
    {
    "skills": ["List of soft skills, general competencies..."],
    "technical_skills": ["List of technical skills, programming languages, tools..."],
    "qualification": ["List of educational qualifications..."],
    "work_experience": ["List of work experience requirements..."],
    "preferred_skills": ["List of preferred/nice-to-have skills..."]
    }

    Rules:
    - Extract exact text as it appears.
    - Do NOT infer or add anything not stated.
    - If no data for a category, return an empty list [].
    - Output MUST be valid JSON.
    """

    if client is None:
        print("❌ Extraction disabled (no Client).")
        return {}

    for attempt in range(3):  # max retries
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash-lite",
                contents=system_prompt + "\n\nJOB DESCRIPTION:\n" + cleaned_text,
                config=types.GenerateContentConfig(
                    temperature=0.1,
                    response_mime_type="application/json"
                )
            )

            raw = response.text.strip()
            # Strip accidental markdown fences (response_mime_type usually
            # prevents them, but be defensive).
            if raw.startswith("```json"):
                raw = raw[7:]
            if raw.startswith("```"):
                raw = raw[3:]
            if raw.endswith("```"):
                raw = raw[:-3]

            return json.loads(raw)

        except Exception as e:
            message = str(e)
            if "503" in message or "overloaded" in message.lower():
                delay = 2 ** (attempt + 1)
                print(f"⚠️ Model overloaded. Retrying in {delay}s...")
                time.sleep(delay)
            else:
                print(f"❌ Gemini Extraction failed: {e}")
                return {}

    return {}
101
+
102
+
103
def upsert_job_entities(sb, job_id: str, experience_level: str, data: Dict[str, Any]) -> None:
    """
    Upserts the extracted entities into the jobs_entities table.

    Missing categories default to empty lists; DB errors are logged only.
    """
    row = {
        "job_id": job_id,
        "experience_level": experience_level,
    }
    for field in ("skills", "technical_skills", "qualification",
                  "work_experience", "preferred_skills"):
        row[field] = data.get(field, [])
    row["updated_at"] = "now()"

    try:
        sb.table("jobs_entities").upsert(row).execute()
        print(f"✅ Database updated for Job ID: {job_id}")
    except Exception as e:
        print(f"❌ DB Upsert Error for {job_id}: {e}")
123
+
124
+
125
def process_single_job(sb, job_id: str, description: str, experience_level: str = None) -> None:
    """
    Processes a single job: extracts entities and upserts to DB.

    Skips silently (with a log line) on empty descriptions or when the
    extractor returns nothing.
    """
    if not (description and description.strip()):
        print(f"⚠️ Skipping empty description for job {job_id}")
        return

    print(f"🔍 Processing Job ID: {job_id}")

    entities = extract_job_entities_gemini(description)
    if entities:
        upsert_job_entities(sb, job_id, experience_level, entities)
    else:
        print("⚠️ No entities extracted.")
141
+
142
+
143
+
144
def process_jobs_from_db() -> None:
    """
    Batch path: fetch every job row from Supabase and run entity
    extraction + upsert for each one.

    Requires SUPABASE_URL and a key in the environment; logs and returns
    early on any setup or query failure.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        print("⚠️ SUPABASE_URL or SUPABASE_KEY not set; skipping job fetch")
        return

    try:
        sb = create_client(SUPABASE_URL, SUPABASE_KEY)
    except Exception as e:
        print(f"⚠️ Failed to create Supabase client: {e}")
        return

    # Fetch jobs from 'jobs' table
    try:
        resp = sb.table("jobs").select("id, description, experience_level").execute()
    except Exception as e:
        print(f"⚠️ Supabase query failed: {e}")
        return

    data = resp.data if hasattr(resp, "data") else []
    if not data:
        print("⚠️ No job descriptions returned from Supabase.")
        return

    print(f"found {len(data)} jobs to process.")

    os.makedirs(PROCESSED_DIR, exist_ok=True)

    for row in data:
        job_id = row.get("id")
        desc = row.get("description") or ""
        # BUG FIX: experience_level was previously an undefined name here
        # (NameError at runtime); read it from the row like the other columns.
        experience_level = row.get("experience_level")

        process_single_job(sb, job_id, desc, experience_level)
176
+
177
+
178
if __name__ == '__main__':
    # Manual batch entry point: pull every job from Supabase, extract
    # entities via Gemini, and write them back to jobs_entities.
    print("🧪 Starting job entity extraction (DB -> Gemini -> DB)...\n")
    process_jobs_from_db()
    print("\n🎯 All jobs processed.")
backend/src/extraction/person_details_extraction_gemini.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.ingestion.parser import parse_file
2
+ import json
3
+ from dotenv import load_dotenv
4
+ from pathlib import Path
5
+ import os
6
+ from google import genai
7
+ import google.genai.types as types
8
+ import time
9
+
10
+ # Load env
11
+ load_dotenv()
12
+
13
+
14
+ client=genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
15
+
16
+ BASE_DIR = Path(__file__).resolve().parents[2]
17
+ RAW_DIR = BASE_DIR / "data" / "resumes" / "raw"
18
+
19
+
20
+ SYSTEM_PROMPT = """
21
+ You are a precise resume entity extraction engine.
22
+
23
+ TASK:
24
+ Extract ONLY the information explicitly present in the resume text.
25
+
26
+ OUTPUT RULES:
27
+ - Output MUST be valid JSON
28
+ - Do NOT hallucinate. If a field is missing, use null.
29
+ -Include empty lists for missing array fields.
30
+ -Include all fields in the output, even if null or empty.
31
+ -Include only the fields specified in the schema below.
32
+ - Do NOT include any explanations, notes, or extra text outside the JSON.
33
+ - Ensure the JSON is properly formatted and parsable.
34
+ - Return "work_experience" as a LIST of objects with fields: role, company, year, duration, description.
35
+ - Calculate "duration" (e.g. "2 years", "6 months") from dates if not explicitly stated.
36
+ - Return "education" as a LIST of objects with fields: course, institution, year.
37
+ - For "skills", "technical_skills", "certifications", and "languages", return LISTS of strings.
38
+ - **CRITICAL**: "languages" refers ONLY to human spoken/written languages (e.g., English, Hindi, Spanish). Programming languages (Python, Java, etc.) MUST go into "technical_skills".
39
+ - For single-value fields like "role", "headline", "summary" and return STRING or null.
40
+ -Calculate experience_years as an INTEGER representing total years of experience, or null if not derivable.
41
+ -only use the field names and structure defined in the schema below.
42
+ -strictLY follow the JSON schema provided.
43
+
44
+ JSON SCHEMA:
45
+ {
46
+ "headline": string | null,
47
+ "summary": string | null,
48
+ "skills": string[],
49
+ "technical_skills": string[], <-- Put Programming Languages HERE
50
+ "education": [
51
+ {
52
+ "course": string | null,
53
+ "institution": string | null,
54
+ "year": string | null
55
+ }
56
+ ],
57
+ "work_experience": [
58
+ {
59
+ "role": string | null,
60
+ "company": string | null,
61
+ "years": string | null,
62
+ "duration": string | null,
63
+ "description": string | null
64
+ }
65
+ ],
66
+ "projects": [
+ {
+ "title": string | null,
+ "technologies_used": string[],
+ "description": string | null
+ }
+ ],
80
+ "certifications": string[],
81
+ "languages": string[],
82
+ "experience_years": integer | null,
+ "current_position": string | null
+ }
85
+ """
86
+
87
+
88
def extract_resume_entities_gemini(text: str) -> dict:
    """Extract structured resume entities from raw text via Gemini.

    Sends SYSTEM_PROMPT plus the resume text to the model and parses the
    JSON response, retrying with exponential backoff when the model is
    overloaded (HTTP 503).

    Args:
        text: The (already PII-masked) resume text.

    Returns:
        A dict matching the JSON schema in SYSTEM_PROMPT, or {} when the
        model fails, returns invalid JSON, or stays overloaded through
        all retries.
    """
    max_retries = 3

    for attempt in range(max_retries):
        try:
            # BUGFIX: the API call must live INSIDE the try, otherwise a
            # 503 raised by generate_content bypasses the retry logic
            # entirely and propagates to the caller.
            response = client.models.generate_content(
                model="gemini-2.5-flash-lite",
                contents=SYSTEM_PROMPT + "\n\nRESUME TEXT:\n" + text,
                config=types.GenerateContentConfig(
                    temperature=0,
                    # Forces Gemini to return raw JSON without Markdown fences.
                    response_mime_type="application/json",
                ),
            )

            # Defensive cleanup: strip accidental Markdown code fences.
            cleaned_text = response.text.strip()
            if cleaned_text.startswith("```json"):
                cleaned_text = cleaned_text[7:]
            if cleaned_text.startswith("```"):
                cleaned_text = cleaned_text[3:]
            if cleaned_text.endswith("```"):
                cleaned_text = cleaned_text[:-3]

            return json.loads(cleaned_text)

        except json.JSONDecodeError as e:
            # Invalid JSON is not retryable; callers treat {} as failure
            # and fall back to the regex extractor.
            print(f"❌ JSON Decode Error: {e}")
            return {}
        except Exception as e:
            error_str = str(e)
            # Retry only on "model overloaded" (HTTP 503) errors.
            if "503" in error_str or "overloaded" in error_str.lower():
                wait_time = 2 ** (attempt + 1)  # Exponential backoff: 2s, 4s, 8s...
                print(f"⚠️ Model overloaded. Retrying in {wait_time} seconds... (Attempt {attempt+1}/{max_retries})")
                time.sleep(wait_time)
            else:
                # Any other error (e.g. auth, safety block) fails immediately.
                print(f"❌ Gemini Error: {e}")
                return {}

    # All retries exhausted without a successful response. The original
    # fell off the end here and returned None; return {} so callers can
    # rely on always getting a dict.
    print(f"❌ Model still overloaded after {max_retries} attempts.")
    return {}
133
+
134
+
135
def process_raw_resumes():
    """Batch-parse every resume in RAW_DIR and print the extracted entities.

    Raises:
        FileNotFoundError: if RAW_DIR does not exist.
    """
    if not RAW_DIR.exists():
        raise FileNotFoundError(f"Directory not found: {RAW_DIR}")

    supported = (".pdf", ".docx", ".txt")
    for resume_path in RAW_DIR.iterdir():
        # Skip anything that is not a supported resume document.
        if resume_path.suffix.lower() not in supported:
            continue

        print(f"\n📄 Processing: {resume_path.name}")

        try:
            resume_text = parse_file(str(resume_path))
            extracted = extract_resume_entities_gemini(resume_text)

            print("✅ Extracted entities:")
            print(extracted)

        except Exception as exc:
            print(f"❌ Failed for {resume_path.name}: {exc}")
154
+
155
+ from src.extraction.fallback_extractor import extract_fallback
156
+ from src.preprocess.regex_pii import extract_contact_info_regex, mask_contact_info_regex
157
+ from src.preprocess.anonymizer import extract_name_and_mask
158
+
159
def process_single_resume(file_path: str) -> dict:
    """
    Helper function for supabase_ingest.py to process a single downloaded file.
    """
    masked_text = ""
    pii_data = {}

    try:
        # Normalize Path objects to a plain string path.
        path_str = str(file_path)

        # Extract the raw text from the file (PDF/DOCX).
        raw_text = parse_file(path_str)

        # Privacy pass 1: regex-mask phone numbers, emails and links.
        print("🔒 [1/2] Masking Phone/Email/Links...")
        pii_data = extract_contact_info_regex(raw_text)
        contact_masked = mask_contact_info_regex(raw_text)

        # Privacy pass 2: NER-mask the candidate's name.
        print("🔒 [2/2] Masking Names (NER)...")
        ner_result = extract_name_and_mask(contact_masked)
        masked_text = ner_result["masked_text"]
        pii_data["full_name"] = ner_result["candidate_name"]

        print(f"DEBUG: Final Masked Text Length: {len(masked_text)}")
        if len(masked_text) < 50:
            print("⚠️ WARNING: Masked text is suspiciously short!")

        # LLM extraction runs on the fully masked text only.
        print("🧠 Sending to Gemini...")
        extracted = extract_resume_entities_gemini(masked_text)

        # Regex fallback when Gemini yields nothing.
        if not extracted:
            print("⚠️ Gemini returned empty. Using Regex Fallback.")
            extracted = extract_fallback(masked_text)

        # Re-attach the PII collected above (whether from Gemini or fallback).
        extracted.update(pii_data)
        return extracted

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")
        # Last-resort fallback on whatever masked text we managed to build.
        if masked_text:
            print("⚠️ Exception occurred. Using Regex Fallback on masked text.")
            fallback_data = extract_fallback(masked_text)
            fallback_data.update(pii_data)
            return fallback_data
        return pii_data
217
+
218
+
219
# CLI entry point: batch-process every resume found in RAW_DIR.
if __name__ == "__main__":
    process_raw_resumes()
backend/src/extraction/test_regex.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
import os

# Make the repository root importable so `backend.*` resolves.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

from backend.src.extraction.fallback_extractor import extract_fallback

# Phone numbers in a variety of international formats, plus negatives.
test_cases = [
    "+91 9876543210",         # India
    "+919876543210",          # India No Space
    "9876543210",             # India Local
    "+1 212-555-0199",        # US
    "+44 7911 123456",        # UK Mobile
    "+971 50 1234567",        # UAE
    "+61 412 345 678",        # Australia
    "+49 151 12345678",       # Germany
    "+33 6 12 34 56 78",      # France
    "+81 90-1234-5678",       # Japan
    "Phone: +91 98765-43210", # In text
    "Call me at 123-456-7890", # US Local in text
    "No phone number here"
]


print("Testing extract_fallback Phone Extraction:")
# Mirror every result to a file so the run can be inspected afterwards.
with open("test_output.txt", "w", encoding="utf-8") as out_file:
    for sample in test_cases:
        extracted = extract_fallback(sample)
        line = f"'{sample}' -> {extracted.get('phone')}"
        print(line)
        out_file.write(line + "\n")
33
+
backend/src/ingestion/__init__.py ADDED
File without changes
backend/src/ingestion/docx_reader.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from docx import Document
2
+ from src.preprocess.cleaner import postprocess_extracted_text
3
+
4
def parse_docx(path: str) -> str:
    """
    Extract text from DOCX file.
    Returns postprocessed text ready for NER and cleaning.
    """
    document = Document(path)
    # Keep only paragraphs with visible content, one per line.
    non_empty = (para.text for para in document.paragraphs if para.text.strip())
    return postprocess_extracted_text("\n".join(non_empty))
backend/src/ingestion/parser.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from .pdf_reader import parse_pdf
3
+ from .docx_reader import parse_docx
4
+ from src.preprocess.cleaner import postprocess_extracted_text
5
+ from src.preprocess.cleaner import clean_text
6
+ from src.preprocess.anonymizer import remove_pii
7
+
8
def parse_file(path: str) -> str:
    """Detect file type and parse accordingly."""
    ext = os.path.splitext(path)[1].lower()

    if ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
    elif ext == ".pdf":
        text = parse_pdf(path)
    elif ext == ".docx":
        text = parse_docx(path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    # Clean, strip PII, then normalize extraction artifacts.
    return postprocess_extracted_text(remove_pii(clean_text(text)))
backend/src/ingestion/pdf_reader.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pypdf
2
+ import pdfplumber
3
+ from src.preprocess.cleaner import postprocess_extracted_text
4
+
5
def parse_pdf(path: str) -> str:
    """
    Extract text from a PDF file.

    Tries pdfplumber first; if it fails or yields no text, falls back
    to pypdf. Returns postprocessed text.

    Raises:
        ValueError: if neither library can extract any text.
    """
    text = ""

    # --- pdfplumber extraction ---
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return postprocess_extracted_text(text)
    except Exception as e:
        print(f"⚠️ pdfplumber failed for {path}: {e}")

    # --- fallback to pypdf ---
    # BUGFIX: reset the buffer. If pdfplumber raised midway it may have
    # left partial pages in `text`, which pypdf would otherwise duplicate.
    text = ""
    try:
        with open(path, "rb") as f:
            reader = pypdf.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return postprocess_extracted_text(text)
    except Exception as e:
        print(f"❌ pypdf also failed for {path}: {e}")

    raise ValueError(f"Unable to extract text from PDF: {path}")
backend/src/matching/__init__.py ADDED
File without changes
backend/src/matching/similarity.py ADDED
File without changes