Spaces:
Sleeping
Sleeping
Muhammed Sameer commited on
Commit ·
ea9ca44
0
Parent(s):
Initial commit - Iris Full (under development)
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +24 -0
- README.md +12 -0
- Supabase/.temp/cli-latest +1 -0
- Supabase/config.toml +66 -0
- Supabase/functions/_shared/cors.ts +4 -0
- Supabase/functions/generate-verification-token/.npmrc +3 -0
- Supabase/functions/generate-verification-token/deno.json +3 -0
- Supabase/functions/generate-verification-token/index.ts +60 -0
- Supabase/functions/initiate-admin-transfer/.npmrc +3 -0
- Supabase/functions/initiate-admin-transfer/deno.json +3 -0
- Supabase/functions/initiate-admin-transfer/index.ts +80 -0
- Supabase/functions/invite-first-admin/.npmrc +3 -0
- Supabase/functions/invite-first-admin/deno.json +3 -0
- Supabase/functions/invite-first-admin/index.ts +24 -0
- Supabase/functions/otp/.npmrc +3 -0
- Supabase/functions/otp/deno.json +3 -0
- Supabase/functions/otp/index.ts +137 -0
- Supabase/functions/send-interview-email/.npmrc +3 -0
- Supabase/functions/send-interview-email/deno.json +3 -0
- Supabase/functions/send-interview-email/index.ts +51 -0
- Supabase/functions/verify-domain/.npmrc +3 -0
- Supabase/functions/verify-domain/deno.json +3 -0
- Supabase/functions/verify-domain/index.ts +32 -0
- backend/.env +12 -0
- backend/.gitignore +26 -0
- backend/add_experience_to_embeddings.sql +5 -0
- backend/add_projects_to_profiles.sql +5 -0
- backend/api.py +157 -0
- backend/create_profile_embeddings.sql +32 -0
- backend/debug_payload.json +69 -0
- backend/debug_resume.txt +6 -0
- backend/requirements.txt +28 -0
- backend/src/__init__.py +0 -0
- backend/src/embeddings/__init__.py +0 -0
- backend/src/embeddings/debug_embedding_storage.py +62 -0
- backend/src/embeddings/job_embed.py +108 -0
- backend/src/embeddings/local_embedder.py +137 -0
- backend/src/embeddings/process_all_profiles.py +46 -0
- backend/src/embeddings/test_embedder.py +34 -0
- backend/src/extraction/__init__.py +0 -0
- backend/src/extraction/fallback_extractor.py +51 -0
- backend/src/extraction/job_extractor.py +181 -0
- backend/src/extraction/person_details_extraction_gemini.py +220 -0
- backend/src/extraction/test_regex.py +33 -0
- backend/src/ingestion/__init__.py +0 -0
- backend/src/ingestion/docx_reader.py +11 -0
- backend/src/ingestion/parser.py +21 -0
- backend/src/ingestion/pdf_reader.py +38 -0
- backend/src/matching/__init__.py +0 -0
- backend/src/matching/similarity.py +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Logs
|
| 2 |
+
logs
|
| 3 |
+
*.log
|
| 4 |
+
npm-debug.log*
|
| 5 |
+
yarn-debug.log*
|
| 6 |
+
yarn-error.log*
|
| 7 |
+
pnpm-debug.log*
|
| 8 |
+
lerna-debug.log*
|
| 9 |
+
|
| 10 |
+
node_modules
|
| 11 |
+
dist
|
| 12 |
+
dist-ssr
|
| 13 |
+
*.local
|
| 14 |
+
|
| 15 |
+
# Editor directories and files
|
| 16 |
+
.vscode/*
|
| 17 |
+
!.vscode/extensions.json
|
| 18 |
+
.idea
|
| 19 |
+
.DS_Store
|
| 20 |
+
*.suo
|
| 21 |
+
*.ntvs*
|
| 22 |
+
*.njsproj
|
| 23 |
+
*.sln
|
| 24 |
+
*.sw?
|
README.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# React + Vite
|
| 2 |
+
|
| 3 |
+
This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
|
| 4 |
+
|
| 5 |
+
Currently, two official plugins are available:
|
| 6 |
+
|
| 7 |
+
- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) for Fast Refresh
|
| 8 |
+
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
|
| 9 |
+
|
| 10 |
+
## Expanding the ESLint configuration
|
| 11 |
+
|
| 12 |
+
If you are developing a production application, we recommend using TypeScript with type-aware lint rules enabled. Check out the [TS template](https://github.com/vitejs/vite/tree/main/packages/create-vite/template-react-ts) for information on how to integrate TypeScript and [`typescript-eslint`](https://typescript-eslint.io) in your project.
|
Supabase/.temp/cli-latest
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
v2.67.1
|
Supabase/config.toml
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
[functions.verify-domain]
|
| 3 |
+
enabled = true
|
| 4 |
+
verify_jwt = true
|
| 5 |
+
import_map = "./functions/verify-domain/deno.json"
|
| 6 |
+
# Uncomment to specify a custom file path to the entrypoint.
|
| 7 |
+
# Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
|
| 8 |
+
entrypoint = "./functions/verify-domain/index.ts"
|
| 9 |
+
# Specifies static files to be bundled with the function. Supports glob patterns.
|
| 10 |
+
# For example, if you want to serve static HTML pages in your function:
|
| 11 |
+
# static_files = [ "./functions/verify-domain/*.html" ]
|
| 12 |
+
|
| 13 |
+
[functions.invite-first-admin]
|
| 14 |
+
enabled = true
|
| 15 |
+
verify_jwt = true
|
| 16 |
+
import_map = "./functions/invite-first-admin/deno.json"
|
| 17 |
+
# Uncomment to specify a custom file path to the entrypoint.
|
| 18 |
+
# Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
|
| 19 |
+
entrypoint = "./functions/invite-first-admin/index.ts"
|
| 20 |
+
# Specifies static files to be bundled with the function. Supports glob patterns.
|
| 21 |
+
# For example, if you want to serve static HTML pages in your function:
|
| 22 |
+
# static_files = [ "./functions/invite-first-admin/*.html" ]
|
| 23 |
+
|
| 24 |
+
[functions.generate-verification-token]
|
| 25 |
+
enabled = true
|
| 26 |
+
verify_jwt = true
|
| 27 |
+
import_map = "./functions/generate-verification-token/deno.json"
|
| 28 |
+
# Uncomment to specify a custom file path to the entrypoint.
|
| 29 |
+
# Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
|
| 30 |
+
entrypoint = "./functions/generate-verification-token/index.ts"
|
| 31 |
+
# Specifies static files to be bundled with the function. Supports glob patterns.
|
| 32 |
+
# For example, if you want to serve static HTML pages in your function:
|
| 33 |
+
# static_files = [ "./functions/generate-verification-token/*.html" ]
|
| 34 |
+
|
| 35 |
+
[functions.initiate-admin-transfer]
|
| 36 |
+
enabled = true
|
| 37 |
+
verify_jwt = true
|
| 38 |
+
import_map = "./functions/initiate-admin-transfer/deno.json"
|
| 39 |
+
# Uncomment to specify a custom file path to the entrypoint.
|
| 40 |
+
# Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
|
| 41 |
+
entrypoint = "./functions/initiate-admin-transfer/index.ts"
|
| 42 |
+
# Specifies static files to be bundled with the function. Supports glob patterns.
|
| 43 |
+
# For example, if you want to serve static HTML pages in your function:
|
| 44 |
+
# static_files = [ "./functions/initiate-admin-transfer/*.html" ]
|
| 45 |
+
|
| 46 |
+
[functions.otp]
|
| 47 |
+
enabled = true
|
| 48 |
+
verify_jwt = true
|
| 49 |
+
import_map = "./functions/otp/deno.json"
|
| 50 |
+
# Uncomment to specify a custom file path to the entrypoint.
|
| 51 |
+
# Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
|
| 52 |
+
entrypoint = "./functions/otp/index.ts"
|
| 53 |
+
# Specifies static files to be bundled with the function. Supports glob patterns.
|
| 54 |
+
# For example, if you want to serve static HTML pages in your function:
|
| 55 |
+
# static_files = [ "./functions/otp/*.html" ]
|
| 56 |
+
|
| 57 |
+
[functions.send-interview-email]
|
| 58 |
+
enabled = true
|
| 59 |
+
verify_jwt = true
|
| 60 |
+
import_map = "./functions/send-interview-email/deno.json"
|
| 61 |
+
# Uncomment to specify a custom file path to the entrypoint.
|
| 62 |
+
# Supported file extensions are: .ts, .js, .mjs, .jsx, .tsx
|
| 63 |
+
entrypoint = "./functions/send-interview-email/index.ts"
|
| 64 |
+
# Specifies static files to be bundled with the function. Supports glob patterns.
|
| 65 |
+
# For example, if you want to serve static HTML pages in your function:
|
| 66 |
+
# static_files = [ "./functions/send-interview-email/*.html" ]
|
Supabase/functions/_shared/cors.ts
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
export const corsHeaders = {
|
| 2 |
+
'Access-Control-Allow-Origin': '*',
|
| 3 |
+
'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type',
|
| 4 |
+
}
|
Supabase/functions/generate-verification-token/.npmrc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration for private npm package dependencies
|
| 2 |
+
# For more information on using private registries with Edge Functions, see:
|
| 3 |
+
# https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
|
Supabase/functions/generate-verification-token/deno.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"imports": {}
|
| 3 |
+
}
|
Supabase/functions/generate-verification-token/index.ts
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// supabase/functions/generate-verification-token/index.ts
|
| 2 |
+
|
| 3 |
+
import { serve } from 'https://deno.land/std@0.177.0/http/server.ts'
|
| 4 |
+
import { createClient } from 'https://esm.sh/@supabase/supabase-js@2.39.8'
|
| 5 |
+
import { corsHeaders } from '../_shared/cors.ts'
|
| 6 |
+
|
| 7 |
+
serve(async (req) => {
|
| 8 |
+
// This is the crucial block that handles the browser's preflight check
|
| 9 |
+
if (req.method === 'OPTIONS') {
|
| 10 |
+
return new Response('ok', { headers: corsHeaders })
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
try {
|
| 14 |
+
const { email } = await req.json();
|
| 15 |
+
const domain = email.split('@')[1];
|
| 16 |
+
if (!domain) {
|
| 17 |
+
throw new Error("Invalid email format.");
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
const supabaseAdmin = createClient(
|
| 21 |
+
Deno.env.get('SUPABASE_URL')!,
|
| 22 |
+
Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!
|
| 23 |
+
);
|
| 24 |
+
|
| 25 |
+
const { data: blockedDomain } = await supabaseAdmin
|
| 26 |
+
.from('blocked_domains')
|
| 27 |
+
.select('domain')
|
| 28 |
+
.eq('domain', domain)
|
| 29 |
+
.single();
|
| 30 |
+
|
| 31 |
+
if (blockedDomain) {
|
| 32 |
+
throw new Error("Please use a business email. Free email providers are not allowed.");
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
const { data, error } = await supabaseAdmin
|
| 36 |
+
.from('organizations')
|
| 37 |
+
.insert({
|
| 38 |
+
name: domain,
|
| 39 |
+
verified_domain: domain,
|
| 40 |
+
})
|
| 41 |
+
.select('verification_token')
|
| 42 |
+
.single();
|
| 43 |
+
|
| 44 |
+
if (error) throw error;
|
| 45 |
+
|
| 46 |
+
return new Response(JSON.stringify({
|
| 47 |
+
verification_token: data.verification_token,
|
| 48 |
+
domain: domain
|
| 49 |
+
}), {
|
| 50 |
+
headers: { ...corsHeaders, 'Content-Type': 'application/json' },
|
| 51 |
+
status: 200,
|
| 52 |
+
});
|
| 53 |
+
|
| 54 |
+
} catch (error) {
|
| 55 |
+
return new Response(JSON.stringify({ error: error.message }), {
|
| 56 |
+
headers: { ...corsHeaders, 'Content-Type': 'application/json' },
|
| 57 |
+
status: 400,
|
| 58 |
+
});
|
| 59 |
+
}
|
| 60 |
+
})
|
Supabase/functions/initiate-admin-transfer/.npmrc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration for private npm package dependencies
|
| 2 |
+
# For more information on using private registries with Edge Functions, see:
|
| 3 |
+
# https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
|
Supabase/functions/initiate-admin-transfer/deno.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"imports": {}
|
| 3 |
+
}
|
Supabase/functions/initiate-admin-transfer/index.ts
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { createClient } from 'https://esm.sh/@supabase/supabase-js@2'
|
| 2 |
+
import { v4 as uuidv4 } from 'https://deno.land/std@0.106.0/uuid/mod.ts';
|
| 3 |
+
|
| 4 |
+
const corsHeaders = {
|
| 5 |
+
'Access-Control-Allow-Origin': '*',
|
| 6 |
+
'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type',
|
| 7 |
+
}
|
| 8 |
+
|
| 9 |
+
Deno.serve(async (req) => {
|
| 10 |
+
if (req.method === 'OPTIONS') {
|
| 11 |
+
return new Response('ok', { headers: corsHeaders })
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
try {
|
| 15 |
+
const { newAdminEmail } = await req.json();
|
| 16 |
+
if (!newAdminEmail) {
|
| 17 |
+
throw new Error("New admin's email is required.");
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
// Create an admin client to bypass RLS
|
| 21 |
+
const supabaseAdmin = createClient(
|
| 22 |
+
Deno.env.get('SUPABASE_URL')!,
|
| 23 |
+
Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!
|
| 24 |
+
);
|
| 25 |
+
|
| 26 |
+
// Get the current user from the request's auth token
|
| 27 |
+
const authHeader = req.headers.get('Authorization')!;
|
| 28 |
+
const jwt = authHeader.replace('Bearer ', '');
|
| 29 |
+
const { data: { user } } = await supabaseAdmin.auth.getUser(jwt);
|
| 30 |
+
if (!user) {
|
| 31 |
+
throw new Error("Could not identify the current user.");
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
// 1. Generate a secure, unique token for the transfer
|
| 35 |
+
const transferToken = uuidv4();
|
| 36 |
+
const expiryDate = new Date();
|
| 37 |
+
expiryDate.setHours(expiryDate.getHours() + 24); // Token is valid for 24 hours
|
| 38 |
+
|
| 39 |
+
// 2. Store the token and link it to the current user's company
|
| 40 |
+
// This assumes the 'companies' table has 'admin_transfer_token' and 'admin_transfer_expires_at' columns
|
| 41 |
+
const { data: profile, error: profileError } = await supabaseAdmin
|
| 42 |
+
.from('profiles').select('company_id').eq('id', user.id).single();
|
| 43 |
+
if (profileError || !profile) throw new Error("Could not find the user's company.");
|
| 44 |
+
|
| 45 |
+
const { error: updateError } = await supabaseAdmin
|
| 46 |
+
.from('companies')
|
| 47 |
+
.update({
|
| 48 |
+
admin_transfer_token: transferToken,
|
| 49 |
+
admin_transfer_expires_at: expiryDate.toISOString(),
|
| 50 |
+
})
|
| 51 |
+
.eq('id', profile.company_id);
|
| 52 |
+
if (updateError) throw new Error("Failed to store the transfer token.");
|
| 53 |
+
|
| 54 |
+
// 3. Send a magic link email to the new admin
|
| 55 |
+
// This link should point to a page in your app that handles the token verification
|
| 56 |
+
const transferUrl = `${Deno.env.get('SITE_URL')}/accept-admin-transfer?token=${transferToken}`;
|
| 57 |
+
|
| 58 |
+
const { error: emailError } = await supabaseAdmin.auth.admin.generateLink({
|
| 59 |
+
type: 'magiclink',
|
| 60 |
+
email: newAdminEmail,
|
| 61 |
+
options: {
|
| 62 |
+
redirectTo: transferUrl
|
| 63 |
+
}
|
| 64 |
+
});
|
| 65 |
+
|
| 66 |
+
if (emailError) {
|
| 67 |
+
throw new Error("Could not send invitation email.");
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
return new Response(JSON.stringify({ success: true, message: "Transfer invitation sent." }), {
|
| 71 |
+
headers: { ...corsHeaders, 'Content-Type': 'application/json' },
|
| 72 |
+
status: 200,
|
| 73 |
+
})
|
| 74 |
+
} catch (err) {
|
| 75 |
+
return new Response(JSON.stringify({ error: err.message }), {
|
| 76 |
+
headers: { ...corsHeaders, 'Content-Type': 'application/json' },
|
| 77 |
+
status: 400,
|
| 78 |
+
})
|
| 79 |
+
}
|
| 80 |
+
})
|
Supabase/functions/invite-first-admin/.npmrc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration for private npm package dependencies
|
| 2 |
+
# For more information on using private registries with Edge Functions, see:
|
| 3 |
+
# https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
|
Supabase/functions/invite-first-admin/deno.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"imports": {}
|
| 3 |
+
}
|
Supabase/functions/invite-first-admin/index.ts
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { serve } from 'https://deno.land/std@0.177.0/http/server.ts'
|
| 2 |
+
import { createClient } from 'https://esm.sh/@supabase/supabase-js@2.39.8'
|
| 3 |
+
import { corsHeaders } from '../_shared/cors.ts'
|
| 4 |
+
|
| 5 |
+
serve(async (req) => {
|
| 6 |
+
if (req.method === 'OPTIONS') { return new Response('ok', { headers: corsHeaders }) }
|
| 7 |
+
try {
|
| 8 |
+
const { adminEmail, domain } = await req.json();
|
| 9 |
+
if (!adminEmail || !domain) throw new Error("Admin email and domain are required.");
|
| 10 |
+
if (adminEmail.split('@')[1] !== domain) throw new Error("Admin email must belong to the verified domain.");
|
| 11 |
+
|
| 12 |
+
const supabaseAdmin = createClient(Deno.env.get('SUPABASE_URL')!, Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!);
|
| 13 |
+
|
| 14 |
+
const { data: orgData, error: orgError } = await supabaseAdmin.from('organizations').select('id').eq('verified_domain', domain).eq('is_verified', true).single();
|
| 15 |
+
if (orgError || !orgData) throw new Error("Cannot send invite: Organization is not verified.");
|
| 16 |
+
|
| 17 |
+
const { error: inviteError } = await supabaseAdmin.auth.admin.inviteUserByEmail(adminEmail);
|
| 18 |
+
if (inviteError) throw inviteError;
|
| 19 |
+
|
| 20 |
+
return new Response(JSON.stringify({ success: true, message: `Invitation sent to ${adminEmail}.` }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' } });
|
| 21 |
+
} catch (error) {
|
| 22 |
+
return new Response(JSON.stringify({ error: error.message }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 400 });
|
| 23 |
+
}
|
| 24 |
+
})
|
Supabase/functions/otp/.npmrc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration for private npm package dependencies
|
| 2 |
+
# For more information on using private registries with Edge Functions, see:
|
| 3 |
+
# https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
|
Supabase/functions/otp/deno.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"imports": {}
|
| 3 |
+
}
|
Supabase/functions/otp/index.ts
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { serve } from "https://deno.land/std@0.168.0/http/server.ts";
|
| 2 |
+
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
|
| 3 |
+
|
| 4 |
+
const corsHeaders = {
|
| 5 |
+
'Access-Control-Allow-Origin': '*',
|
| 6 |
+
'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type',
|
| 7 |
+
};
|
| 8 |
+
|
| 9 |
+
serve(async (req) => {
|
| 10 |
+
if (req.method === 'OPTIONS') {
|
| 11 |
+
return new Response('ok', { headers: corsHeaders });
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
try {
|
| 15 |
+
// 1. Init Supabase Clients
|
| 16 |
+
const supabaseAdmin = createClient(
|
| 17 |
+
Deno.env.get('SUPABASE_URL') ?? '',
|
| 18 |
+
Deno.env.get('SUPABASE_SERVICE_ROLE_KEY') ?? ''
|
| 19 |
+
);
|
| 20 |
+
|
| 21 |
+
const authHeader = req.headers.get('Authorization')!;
|
| 22 |
+
const supabaseClient = createClient(
|
| 23 |
+
Deno.env.get('SUPABASE_URL') ?? '',
|
| 24 |
+
Deno.env.get('SUPABASE_ANON_KEY') ?? '',
|
| 25 |
+
{ global: { headers: { Authorization: authHeader } } }
|
| 26 |
+
);
|
| 27 |
+
|
| 28 |
+
// 2. Auth Check
|
| 29 |
+
const { data: { user }, error: authError } = await supabaseClient.auth.getUser();
|
| 30 |
+
if (authError || !user) throw new Error("Unauthorized");
|
| 31 |
+
|
| 32 |
+
const { action, userCode } = await req.json();
|
| 33 |
+
|
| 34 |
+
// ==========================================
|
| 35 |
+
// ACTION: SEND SMS (VIA TWILIO)
|
| 36 |
+
// ==========================================
|
| 37 |
+
if (action === 'send') {
|
| 38 |
+
const { data: profile } = await supabaseAdmin
|
| 39 |
+
.from('profiles')
|
| 40 |
+
.select('phone')
|
| 41 |
+
.eq('id', user.id)
|
| 42 |
+
.single();
|
| 43 |
+
|
| 44 |
+
if (!profile?.phone) throw new Error("No phone number found in profile.");
|
| 45 |
+
|
| 46 |
+
const phone = profile.phone;
|
| 47 |
+
const otp = Math.floor(100000 + Math.random() * 900000).toString();
|
| 48 |
+
const expiresAt = new Date(Date.now() + 5 * 60 * 1000).toISOString();
|
| 49 |
+
|
| 50 |
+
// Upsert OTP to DB
|
| 51 |
+
const { error: upsertError } = await supabaseAdmin
|
| 52 |
+
.from('otp_verifications')
|
| 53 |
+
.upsert({ phone, otp_code: otp, expires_at: expiresAt, attempts_count: 0 }, { onConflict: 'phone' });
|
| 54 |
+
|
| 55 |
+
if (upsertError) throw upsertError;
|
| 56 |
+
|
| 57 |
+
// --- TWILIO SENDING LOGIC STARTS HERE ---
|
| 58 |
+
const accountSid = Deno.env.get("TWILIO_ACCOUNT_SID");
|
| 59 |
+
const authToken = Deno.env.get("TWILIO_AUTH_TOKEN");
|
| 60 |
+
const fromNumber = Deno.env.get("TWILIO_PHONE_NUMBER");
|
| 61 |
+
|
| 62 |
+
if (!accountSid || !authToken || !fromNumber) {
|
| 63 |
+
throw new Error("Twilio secrets are missing in Supabase.");
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
// Format parameters for Twilio API
|
| 67 |
+
const params = new URLSearchParams();
|
| 68 |
+
params.append('To', phone);
|
| 69 |
+
params.append('From', fromNumber);
|
| 70 |
+
params.append('Body', `Your Verification Code is: ${otp}`);
|
| 71 |
+
|
| 72 |
+
console.log(`Sending SMS to ${phone}...`);
|
| 73 |
+
|
| 74 |
+
const twilioRes = await fetch(
|
| 75 |
+
`https://api.twilio.com/2010-04-01/Accounts/${accountSid}/Messages.json`,
|
| 76 |
+
{
|
| 77 |
+
method: "POST",
|
| 78 |
+
headers: {
|
| 79 |
+
"Authorization": `Basic ${btoa(`${accountSid}:${authToken}`)}`,
|
| 80 |
+
"Content-Type": "application/x-www-form-urlencoded",
|
| 81 |
+
},
|
| 82 |
+
body: params,
|
| 83 |
+
}
|
| 84 |
+
);
|
| 85 |
+
|
| 86 |
+
if (!twilioRes.ok) {
|
| 87 |
+
const errorText = await twilioRes.text();
|
| 88 |
+
console.error("Twilio Error:", errorText);
|
| 89 |
+
throw new Error("Failed to send SMS. Check server logs.");
|
| 90 |
+
}
|
| 91 |
+
// --- TWILIO LOGIC ENDS HERE ---
|
| 92 |
+
|
| 93 |
+
return new Response(
|
| 94 |
+
JSON.stringify({ message: "OTP sent successfully" }),
|
| 95 |
+
{ headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 200 }
|
| 96 |
+
);
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
// ==========================================
|
| 100 |
+
// ACTION: VERIFY
|
| 101 |
+
// ==========================================
|
| 102 |
+
if (action === 'verify') {
|
| 103 |
+
if (!userCode) throw new Error("Missing OTP code");
|
| 104 |
+
|
| 105 |
+
const { data: profile } = await supabaseAdmin.from('profiles').select('phone').eq('id', user.id).single();
|
| 106 |
+
const phone = profile?.phone;
|
| 107 |
+
|
| 108 |
+
const { data: record } = await supabaseAdmin.from('otp_verifications').select('*').eq('phone', phone).single();
|
| 109 |
+
|
| 110 |
+
if (!record) throw new Error("Invalid or expired OTP.");
|
| 111 |
+
if (new Date() > new Date(record.expires_at)) throw new Error("OTP has expired.");
|
| 112 |
+
if (record.attempts_count >= 3) throw new Error("Too many attempts.");
|
| 113 |
+
|
| 114 |
+
if (record.otp_code !== userCode) {
|
| 115 |
+
await supabaseAdmin.from('otp_verifications').update({ attempts_count: record.attempts_count + 1 }).eq('phone', phone);
|
| 116 |
+
throw new Error("Incorrect OTP code.");
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
// Success
|
| 120 |
+
await supabaseAdmin.from('profiles').update({ is_phone_verified: true }).eq('id', user.id);
|
| 121 |
+
await supabaseAdmin.from('otp_verifications').delete().eq('phone', phone);
|
| 122 |
+
|
| 123 |
+
return new Response(
|
| 124 |
+
JSON.stringify({ message: "Phone verified successfully!" }),
|
| 125 |
+
{ headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 200 }
|
| 126 |
+
);
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
return new Response(JSON.stringify({ error: "Invalid Action" }), { status: 400, headers: corsHeaders });
|
| 130 |
+
|
| 131 |
+
} catch (error) {
|
| 132 |
+
return new Response(
|
| 133 |
+
JSON.stringify({ error: error.message }),
|
| 134 |
+
{ headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 400 }
|
| 135 |
+
);
|
| 136 |
+
}
|
| 137 |
+
});
|
Supabase/functions/send-interview-email/.npmrc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration for private npm package dependencies
|
| 2 |
+
# For more information on using private registries with Edge Functions, see:
|
| 3 |
+
# https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
|
Supabase/functions/send-interview-email/deno.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"imports": {}
|
| 3 |
+
}
|
Supabase/functions/send-interview-email/index.ts
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// supabase/functions/send-interview-email/index.ts
|
| 2 |
+
import { serve } from "https://deno.land/std@0.168.0/http/server.ts";
|
| 3 |
+
import { Resend } from "npm:resend";
|
| 4 |
+
|
| 5 |
+
const resend = new Resend(Deno.env.get("RESEND_API_KEY"));
|
| 6 |
+
|
| 7 |
+
const corsHeaders = {
|
| 8 |
+
"Access-Control-Allow-Origin": "*",
|
| 9 |
+
"Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
|
| 10 |
+
};
|
| 11 |
+
|
| 12 |
+
serve(async (req) => {
|
| 13 |
+
if (req.method === "OPTIONS") return new Response("ok", { headers: corsHeaders });
|
| 14 |
+
|
| 15 |
+
try {
|
| 16 |
+
const { candidateName, candidateEmail, date, time, meetingLink, role } = await req.json();
|
| 17 |
+
|
| 18 |
+
const { data, error } = await resend.emails.send({
|
| 19 |
+
from: "Acme HR <onboarding@resend.dev>", // Change this to your verified domain if you have one
|
| 20 |
+
to: [candidateEmail],
|
| 21 |
+
subject: `Interview Invitation: ${role}`,
|
| 22 |
+
html: `
|
| 23 |
+
<div style="font-family: sans-serif; padding: 20px;">
|
| 24 |
+
<h1>Hi ${candidateName},</h1>
|
| 25 |
+
<p>We are pleased to invite you to a <strong>Technical Interview</strong> for the <strong>${role}</strong> position.</p>
|
| 26 |
+
|
| 27 |
+
<div style="background: #f3f4f6; padding: 15px; border-radius: 8px; margin: 20px 0;">
|
| 28 |
+
<p style="margin: 5px 0;"><strong>📅 Date:</strong> ${date}</p>
|
| 29 |
+
<p style="margin: 5px 0;"><strong>⏰ Time:</strong> ${time}</p>
|
| 30 |
+
<p style="margin: 5px 0;"><strong>🔗 Link:</strong> <a href="${meetingLink}">${meetingLink}</a></p>
|
| 31 |
+
</div>
|
| 32 |
+
|
| 33 |
+
<p>Please join 5 minutes early.</p>
|
| 34 |
+
<p>Best,<br>Hiring Team</p>
|
| 35 |
+
</div>
|
| 36 |
+
`,
|
| 37 |
+
});
|
| 38 |
+
|
| 39 |
+
if (error) throw error;
|
| 40 |
+
|
| 41 |
+
return new Response(JSON.stringify(data), {
|
| 42 |
+
headers: { ...corsHeaders, "Content-Type": "application/json" },
|
| 43 |
+
status: 200,
|
| 44 |
+
});
|
| 45 |
+
} catch (error) {
|
| 46 |
+
return new Response(JSON.stringify({ error: error.message }), {
|
| 47 |
+
headers: { ...corsHeaders, "Content-Type": "application/json" },
|
| 48 |
+
status: 500,
|
| 49 |
+
});
|
| 50 |
+
}
|
| 51 |
+
});
|
Supabase/functions/verify-domain/.npmrc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration for private npm package dependencies
|
| 2 |
+
# For more information on using private registries with Edge Functions, see:
|
| 3 |
+
# https://supabase.com/docs/guides/functions/import-maps#importing-from-private-registries
|
Supabase/functions/verify-domain/deno.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"imports": {}
|
| 3 |
+
}
|
Supabase/functions/verify-domain/index.ts
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { serve } from 'https://deno.land/std@0.177.0/http/server.ts'
|
| 2 |
+
import { createClient } from 'https://esm.sh/@supabase/supabase-js@2.39.8'
|
| 3 |
+
import { corsHeaders } from '../_shared/cors.ts'
|
| 4 |
+
|
| 5 |
+
serve(async (req) => {
|
| 6 |
+
if (req.method === 'OPTIONS') { return new Response('ok', { headers: corsHeaders }) }
|
| 7 |
+
try {
|
| 8 |
+
const { domain } = await req.json();
|
| 9 |
+
if (!domain) throw new Error("Domain is required.");
|
| 10 |
+
|
| 11 |
+
const supabaseAdmin = createClient(Deno.env.get('SUPABASE_URL')!, Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')!);
|
| 12 |
+
|
| 13 |
+
const { data: orgData, error: orgError } = await supabaseAdmin.from('organizations').select('verification_token, is_verified').eq('verified_domain', domain).single();
|
| 14 |
+
if (orgError) throw new Error("Could not find an organization for this domain.");
|
| 15 |
+
if (orgData.is_verified) return new Response(JSON.stringify({ success: true, message: 'Domain is already verified.' }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' } });
|
| 16 |
+
|
| 17 |
+
const expectedToken = `${orgData.verification_token}`;
|
| 18 |
+
let isVerified = false;
|
| 19 |
+
const txtRecords = await Deno.resolveDns(domain, "TXT");
|
| 20 |
+
|
| 21 |
+
for (const record of txtRecords) { if (record.includes(expectedToken)) { isVerified = true; break; } }
|
| 22 |
+
|
| 23 |
+
if (isVerified) {
|
| 24 |
+
await supabaseAdmin.from('organizations').update({ is_verified: true, verification_token: null // <-- The added line }).eq('verified_domain', domain);
|
| 25 |
+
return new Response(JSON.stringify({ success: true, message: 'Domain verified!' }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' } });
|
| 26 |
+
} else {
|
| 27 |
+
throw new Error("Verification failed. TXT record not found or has not propagated yet.");
|
| 28 |
+
}
|
| 29 |
+
} catch (error) {
|
| 30 |
+
return new Response(JSON.stringify({ error: error.message }), { headers: { ...corsHeaders, 'Content-Type': 'application/json' }, status: 400 });
|
| 31 |
+
}
|
| 32 |
+
})
|
backend/.env
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SECURITY: a live OpenAI key was committed here. Rotate it immediately and
# load the real value from an untracked .env or a secrets manager.
OPENAI_API_KEY=
|
| 2 |
+
# SECURITY: a live Gemini key was committed here. Rotate it and set it locally.
GEMINI_API_KEY=
|
| 3 |
+
# Supabase configuration (fill these with your project values)
|
| 4 |
+
# SUPABASE_URL: e.g. https://your-project.supabase.co
|
| 5 |
+
# SUPABASE_KEY: service role key or anon key (prefer service role for server-side ops)
|
| 6 |
+
SUPABASE_URL=https://obhychdzwbytlzwrjrbl.supabase.co
|
| 7 |
+
# SECURITY: the service-role JWT bypasses RLS entirely — a committed copy grants
# full database access. Rotate it in the Supabase dashboard and set it locally.
SUPABASE_SERVICE_ROLE_KEY=
|
| 8 |
+
# Optional: storage bucket and prefix to fetch resumes from
|
| 9 |
+
SUPABASE_BUCKET=resume
|
| 10 |
+
SUPABASE_PREFIX=""
|
| 11 |
+
# Set to 1/true/yes to enable automatic fetching from Supabase when running `run_pipeline.py`
|
| 12 |
+
USE_SUPABASE_RAW=1
|
backend/.gitignore
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Secrets
.env
.env.*
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# Python
|
| 5 |
+
__pycache__/
|
| 6 |
+
*.pyc
|
| 7 |
+
*.pyo
|
| 8 |
+
*.pyd
|
| 9 |
+
.Python
|
| 10 |
+
env/
|
| 11 |
+
venv/
|
| 12 |
+
.venv/
|
| 13 |
+
pip-log.txt
|
| 14 |
+
pip-delete-this-directory.txt
|
| 15 |
+
.tox/
|
| 16 |
+
.coverage
|
| 17 |
+
.coverage.*
|
| 18 |
+
.cache
|
| 19 |
+
nosetests.xml
|
| 20 |
+
coverage.xml
|
| 21 |
+
*.cover
|
| 22 |
+
*.log
|
| 23 |
+
.pytest_cache/
|
| 24 |
+
|
| 25 |
+
# Data
|
| 26 |
+
data/
|
backend/add_experience_to_embeddings.sql
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
-- Add the missing 'experience' column to profile_embeddings.
-- 1024 dimensions matches the BAAI/bge-m3 model used by the backend embedder,
-- so this must stay in sync with the other vector columns.
alter table profile_embeddings
add column if not exists experience vector(1024);
|
backend/add_projects_to_profiles.sql
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
-- Add 'projects' column to profiles table to store extracted project details.
-- Stores a JSONB array of objects, e.g. [{ title, technologies_used, description }];
-- defaults to an empty array so existing rows need no backfill.
alter table profiles
add column if not exists projects jsonb default '[]'::jsonb;
|
backend/api.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# api.py
|
| 2 |
+
import os
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
# Load env BEFORE importing modules that depend on it
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
from fastapi import FastAPI, HTTPException, UploadFile, Form, File
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
from supabase import create_client
|
| 11 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
+
from supabase_ingest import process_resume
|
| 13 |
+
from src.extraction.job_extractor import process_single_job
|
| 14 |
+
from src.services.ats_service import analyze_ats_compatibility
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
app = FastAPI()
|
| 18 |
+
|
| 19 |
+
app.add_middleware(
|
| 20 |
+
CORSMiddleware,
|
| 21 |
+
allow_origins=["*"], # Allow all origins for dev; restrict in prod
|
| 22 |
+
allow_credentials=True,
|
| 23 |
+
allow_methods=["*"],
|
| 24 |
+
allow_headers=["*"],
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
# Setup Supabase Client
|
| 28 |
+
SUPABASE_URL = os.environ.get("SUPABASE_URL")
|
| 29 |
+
# Use Service Role Key if available to bypass RLS
|
| 30 |
+
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") or os.environ.get("SUPABASE_KEY")
|
| 31 |
+
|
| 32 |
+
if not SUPABASE_URL or not SUPABASE_KEY:
|
| 33 |
+
raise RuntimeError("SUPABASE_URL and SUPABASE_KEY (or SUPABASE_SERVICE_ROLE_KEY) must be set in .env")
|
| 34 |
+
|
| 35 |
+
client = create_client(SUPABASE_URL, SUPABASE_KEY)
|
| 36 |
+
|
| 37 |
+
# Define the data we expect from the frontend
|
| 38 |
+
class ResumeRequest(BaseModel):
    """Payload the frontend sends to /process-resume."""

    user_id: str    # Supabase auth user id; also the storage folder name
    file_path: str # e.g., "user_123/resume.pdf"
|
| 41 |
+
|
| 42 |
+
@app.post("/process-resume")
async def process_resume_endpoint(request: ResumeRequest):
    """Run the resume extraction pipeline for one user's uploaded file.

    Delegates all work to `process_resume`; any failure becomes a 500.
    """
    print(f"🔔 Signal received: Process resume for {request.user_id}")

    try:
        extracted = process_resume(client, request.user_id, request.file_path)
    except Exception as exc:
        print(f"❌ Error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

    return {"status": "success", "data": extracted}
|
| 54 |
+
|
| 55 |
+
# ---------------------------------------------------------------------
|
| 56 |
+
# WEBHOOK ENDPOINT (Called by Supabase)
|
| 57 |
+
# ---------------------------------------------------------------------
|
| 58 |
+
|
| 59 |
+
from typing import Dict, Any, Optional
|
| 60 |
+
|
| 61 |
+
class StorageEventRequest(BaseModel):
    """Supabase Database Webhook payload (shared by the storage and jobs hooks)."""

    type: str                  # event kind, e.g. "INSERT" or "UPDATE"
    table: str                 # source table, e.g. "objects" or "jobs"
    record: Dict[str, Any]     # the inserted/updated row
    # NOTE(review): this field shadows pydantic's BaseModel.schema() method name —
    # confirm nothing calls .schema() on instances of this model.
    schema: str
    old_record: Optional[Dict[str, Any]] = None  # previous row on UPDATE, absent on INSERT
|
| 67 |
+
|
| 68 |
+
@app.post("/webhook/storage")
async def storage_webhook(request: StorageEventRequest):
    """
    Handles Database Webhooks from Supabase (storage.objects insert/update).

    Expects uploads to the 'resume' bucket with paths shaped "user_id/filename";
    triggers resume processing for that user. Returns an "ignored" status for
    irrelevant events so Supabase does not retry them.
    """
    print(f"🔔 Webhook received: {request.type} on {request.table}")

    # Only INSERTs or UPDATEs (overwrites) on storage.objects are relevant.
    if request.type not in ("INSERT", "UPDATE") or request.table != "objects":
        return {"status": "ignored"}

    # Object path example: "user_123/123456_resume.pdf"
    file_path = request.record.get("name")
    bucket_id = request.record.get("bucket_id")

    if bucket_id != "resume":
        print(f"⚠️ Ignoring upload to bucket: {bucket_id}")
        return {"status": "ignored", "reason": "wrong bucket"}

    # Robustness fix: the record may lack "name"; the old code relied on
    # None.split(...) raising inside a broad try/except. Guard explicitly.
    if file_path is None:
        print("❌ Webhook record missing object name; cannot extract user_id")
        return {"status": "error", "message": "invalid file path structure"}

    # User ID is the first path segment (folder structure: user_id/filename).
    user_id = file_path.split("/")[0]

    print(f"▶️ Triggering processing for {file_path}")

    try:
        process_resume(client, user_id, file_path)
        return {"status": "success"}
    except Exception as e:
        print(f"❌ Processing failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
@app.post("/webhook/jobs")
async def jobs_webhook(request: StorageEventRequest):
    """
    Handles Database Webhooks from Supabase for the 'jobs' table.

    Both INSERT and UPDATE re-run job entity extraction; we do not yet check
    whether the description actually changed.
    """
    print(f"🔔 Webhook received: {request.type} on {request.table}")

    if request.table != "jobs":
        return {"status": "ignored", "reason": "wrong table"}

    record = request.record
    job_id = record.get("id")
    description = record.get("description")
    experience_level = record.get("experience_level")

    if not job_id:
        print("❌ Webhook missing job_id")
        return {"status": "error", "message": "missing id"}

    print(f"▶️ Triggering job extraction for Job ID: {job_id}")

    try:
        # Re-use the module-level Supabase client.
        process_single_job(client, job_id, description, experience_level)
        return {"status": "success"}
    except Exception as exc:
        print(f"❌ Job processing failed: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 138 |
+
|
| 139 |
+
@app.post("/analyze-ats")
async def analyze_ats_endpoint(
    resume: UploadFile = File(...),
    job_description: str = Form(...)
):
    """
    Real-time ATS compatibility analysis of an uploaded resume against a
    job description. Nothing is persisted to the database.
    """
    print(f"🔍 Analyzing ATS compatibility for: {resume.filename}")
    try:
        analysis = await analyze_ats_compatibility(resume, job_description)
    except Exception as exc:
        print(f"❌ ATS Analysis failed: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
    return {"status": "success", "data": analysis}
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# Run with: uvicorn api:app --reload
|
backend/create_profile_embeddings.sql
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Enable the pgvector extension to work with embedding vectors
create extension if not exists vector;

-- One row per profile; each column holds the embedding of one profile field.
-- 1024 dimensions matches the BAAI/bge-m3 model used by the backend embedder.
create table if not exists profile_embeddings (
    id uuid references profiles(id) on delete cascade primary key,
    headline vector(1024),
    summary vector(1024),
    skills vector(1024),
    technical_skills vector(1024),
    experience vector(1024),
    certifications vector(1024),
    languages vector(1024),
    created_at timestamp with time zone default timezone('utc'::text, now()) not null,
    updated_at timestamp with time zone default timezone('utc'::text, now()) not null
);

-- Enable Row Level Security (RLS)
alter table profile_embeddings enable row level security;

-- Create policies (Adjust based on your actual auth requirements)
-- Allow read access to everyone (or authenticated users)
create policy "Allow read access for all users"
on profile_embeddings for select
using ( true );

-- Allow update/insert only for service_role or the user who owns the profile
-- (Assuming auth.uid() matches the profile id)
-- NOTE(review): this "for all" policy has no WITH CHECK clause, so Postgres
-- reuses the USING expression for writes — confirm that is intended. Backend
-- writes use the service-role key, which bypasses RLS entirely.
create policy "Users can update their own embeddings"
on profile_embeddings for all
using ( auth.uid() = id );
|
backend/debug_payload.json
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"id": "81185bdc-85be-4ff2-99c7-16cf8356cb51",
|
| 3 |
+
"resume_url": "81185bdc-85be-4ff2-99c7-16cf8356cb51/resume.pdf",
|
| 4 |
+
"file_hash": "f6f9d1e0b3badc01329126aa9f249a3e26f1ba12e26d274de0323c359faa1c13",
|
| 5 |
+
"processed": true,
|
| 6 |
+
"updated_at": "now()",
|
| 7 |
+
"full_name": "med Raffi",
|
| 8 |
+
"summary": "Computer Science student proficient in Python, Java, and C with strong skills in Object-Oriented Programming. Experienced in software development and version control using Git. Adaptable team player focused on solving complex technical challenges.",
|
| 9 |
+
"phone": "+9195390771",
|
| 10 |
+
"email": "saheedmuhammedraffi@gmail.com",
|
| 11 |
+
"skills": [
|
| 12 |
+
"Communication",
|
| 13 |
+
"Teamwork",
|
| 14 |
+
"Adaptability",
|
| 15 |
+
"Analytical Thinking"
|
| 16 |
+
],
|
| 17 |
+
"technical_skills": "Python, Java, C, SQL, HTML, CSS, JavaScript, Flask, React, Pandas, Scikit-learn, NumPy, Git, VSCode, GoogleColab, Docker, TensorFlow",
|
| 18 |
+
"education": [
|
| 19 |
+
{
|
| 20 |
+
"course": "B.Tech in Computer Science and Engineering",
|
| 21 |
+
"institution": "Carmel College of Engineering and Technology, Alappuzha",
|
| 22 |
+
"year": "2022 Present"
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"course": "Higher Secondary Education",
|
| 26 |
+
"institution": "S.D.V. English Medium Higher Secondary School, Alappuzha",
|
| 27 |
+
"year": null
|
| 28 |
+
}
|
| 29 |
+
],
|
| 30 |
+
"work_experience": [
|
| 31 |
+
{
|
| 32 |
+
"role": "AI/ML Intern",
|
| 33 |
+
"company": "ICT Academy of Kerala, Trivandrum",
|
| 34 |
+
"years": "Jun 2025 - Jul 2025",
|
| 35 |
+
"description": "Underwent a 1-month internship on Artificial Intelligence and Machine Learning. Collaborated with a 5-member team to deploy a prototype ML model tested on real-world datasets."
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"role": "Webmaster",
|
| 39 |
+
"company": "IEEE Computer Society",
|
| 40 |
+
"years": "July 2025 Present",
|
| 41 |
+
"description": "Developed a responsive web portal and admin dashboard to streamline real-time event tracking and member registration."
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"role": "TEDxCCET Curation Lead",
|
| 45 |
+
"company": "Dept. of Computer Science, CCET",
|
| 46 |
+
"years": "Nov 2025 Present",
|
| 47 |
+
"description": "Manage speaker logistics, schedules, and deliverables to ensure strict adherence to event timelines. Coordinate technical requirements and stage cues between speakers and the production team."
|
| 48 |
+
}
|
| 49 |
+
],
|
| 50 |
+
"projects": [
|
| 51 |
+
{
|
| 52 |
+
"tech_stack": [
|
| 53 |
+
"React",
|
| 54 |
+
"Supabase"
|
| 55 |
+
],
|
| 56 |
+
"description": "A full-stack milk management and distribution system that automates milk collection, farmer payments, billing, and delivery tracking through a centralized platform."
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"tech_stack": [
|
| 60 |
+
"Python",
|
| 61 |
+
"TensorFlow",
|
| 62 |
+
"Flask",
|
| 63 |
+
"React"
|
| 64 |
+
],
|
| 65 |
+
"description": "Developed an LSTM-based model to forecast short-term stock prices using live data. Integrated the trained model into a Flask API with a React interface for real-time trend prediction."
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"certifications": "AIML Internship ICT Academy of Kerala 2025, Python Foundation Certification Springboard 2025, Programming in Java NPTEL 2024"
|
| 69 |
+
}
|
backend/debug_resume.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Candidate Name: Jane Doe
|
| 2 |
+
Email: jane@example.com
|
| 3 |
+
Projects:
|
| 4 |
+
1. E-Commerce App
|
| 5 |
+
Tech Stack: React, Node.js
|
| 6 |
+
Description: A shopping site.
|
backend/requirements.txt
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ================== Core utilities ==================
|
| 2 |
+
python-dotenv>=1.0.0
|
| 3 |
+
pandas>=2.0.0
|
| 4 |
+
tqdm>=4.66.0
|
| 5 |
+
|
| 6 |
+
# ================== PDF / DOC processing ==================
|
| 7 |
+
pypdf>=3.0.0
|
| 8 |
+
pdfplumber>=0.10.0
|
| 9 |
+
python-docx>=0.8.11
|
| 10 |
+
unicodedata2>=0.7.2
|
| 11 |
+
|
| 12 |
+
# ================== NLP preprocessing ==================
|
| 13 |
+
nltk>=3.8.1
|
| 14 |
+
|
| 15 |
+
# ================== Hugging Face / ML ==================
|
| 16 |
+
transformers>=4.44.0
|
| 17 |
+
torch>=2.2.0
|
| 18 |
+
sentence-transformers>=2.2.2
|
| 19 |
+
datasets>=2.19.0
|
| 20 |
+
accelerate>=0.30.0
|
| 21 |
+
|
| 22 |
+
# ================== APIs ==================
|
| 23 |
+
openai>=1.30.0
|
| 24 |
+
supabase>=2.0.0
|
| 25 |
+
fastapi>=0.109.0
|
| 26 |
+
uvicorn>=0.27.0
|
| 27 |
+
python-multipart>=0.0.9
|
| 28 |
+
google-genai>=0.2.0
|
backend/src/__init__.py
ADDED
|
File without changes
|
backend/src/embeddings/__init__.py
ADDED
|
File without changes
|
backend/src/embeddings/debug_embedding_storage.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import sys
import os
import time  # NOTE(review): currently unused in this script

# Add 'backend' directory to path so we can import 'supabase_ingest' directly
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

from supabase_ingest import safe_generate_and_store_embeddings, client

# Mock data for a throwaway test user. The embedding function reads the
# profile from the DB, so these values are also upserted into `profiles` below.
user_id = "test_user_debug_123"
extracted_data = {
    "headline": "Debug Engineer",
    "summary": "This is a test summary for debugging.",
    "skills": "Debug, Python", # DB stores as string
    "technical_skills": "SQL, Vector DB", # DB stores as string
    "certifications": "",
    "languages": "English" # DB stores as string
}

print(f"DEBUG: Testing embedding storage for User ID: {user_id}")

# 1. Ensure user exists in profiles first (FK constraint on profile_embeddings.id)
try:
    print("DEBUG: Ensuring profile exists...")
    # UPSERT the mock data into the profiles table so the function can fetch it
    profile_payload = {
        "id": user_id,
        "full_name": "Debug User",
        "email": "debug@example.com",
        "updated_at": "now()",
        # Add the fields we expect to be there
        "headline": extracted_data["headline"],
        "summary": extracted_data["summary"],
        "skills": extracted_data["skills"],
        "technical_skills": extracted_data["technical_skills"],
        "certifications": extracted_data["certifications"],
        "languages": extracted_data["languages"]
    }
    client.table("profiles").upsert(profile_payload).execute()
    print("DEBUG: Profile upserted.")
except Exception as e:
    print(f"❌ Failed to create test profile: {e}")
    sys.exit(1)

# 2. Run the function under test.
print("DEBUG: Running safe_generate_and_store_embeddings...")
# It fetches the profile from the DB internally, so we don't pass extracted_data.
safe_generate_and_store_embeddings(client, user_id)

# 3. Verify that a row landed in profile_embeddings.
try:
    print("DEBUG: Verifying storage...")
    resp = client.table("profile_embeddings").select("*").eq("id", user_id).execute()
    if resp.data:
        print("✅ SUCCESS: Embedding record found!")
        print(f"Data keys: {resp.data[0].keys()}")
    else:
        print("❌ FAILURE: No record found in profile_embeddings.")
except Exception as e:
    print(f"❌ Verification failed: {e}")
|
backend/src/embeddings/job_embed.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
from typing import List
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
from supabase import create_client
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
|
| 8 |
+
# Load env
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
SUPABASE_URL = os.environ.get("SUPABASE_URL")
|
| 12 |
+
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") or os.environ.get("SUPABASE_KEY")
|
| 13 |
+
|
| 14 |
+
# Singleton model (same pattern as profile code)
|
| 15 |
+
_model = None
|
| 16 |
+
|
| 17 |
+
def get_model():
    """Return the process-wide BAAI/bge-m3 model, loading it on first use."""
    global _model
    if _model is not None:
        return _model
    print("📥 Loading BAAI/bge-m3 model for job embeddings...")
    _model = SentenceTransformer("BAAI/bge-m3")
    return _model
|
| 23 |
+
|
| 24 |
+
def get_supabase():
    """Build a Supabase client from env vars; return None when credentials are absent."""
    if SUPABASE_URL and SUPABASE_KEY:
        return create_client(SUPABASE_URL, SUPABASE_KEY)
    print("❌ Missing Supabase credentials for job embeddings.")
    return None
|
| 29 |
+
|
| 30 |
+
# -------- Embedding helpers (IDENTICAL LOGIC) --------
|
| 31 |
+
|
| 32 |
+
def generate_embedding(text: str) -> List[float]:
    """Embed one string with bge-m3; blank or missing input yields a 1024-d zero vector."""
    if not text or not text.strip():
        return [0.0] * 1024
    vector = get_model().encode(text, normalize_embeddings=True)
    return vector.tolist()
|
| 39 |
+
|
| 40 |
+
def generate_list_embedding(items: List[str]) -> List[float]:
    """Embed each item with bge-m3 and mean-pool into one 1024-d vector; empty input yields zeros."""
    if not items:
        return [0.0] * 1024
    per_item = get_model().encode(items, normalize_embeddings=True)
    return np.mean(per_item, axis=0).tolist()
|
| 48 |
+
|
| 49 |
+
# ----------------------------------------------------
|
| 50 |
+
|
| 51 |
+
def safe_generate_and_store_job_embeddings(client, job_id: str) -> None:
    """
    Fetch a job's extracted entities, embed each entity group with bge-m3,
    and upsert the vectors into the job_embeddings table.

    Failures during embedding/upsert are logged, never raised.
    """
    print(f"🧬 Generating job embeddings for Job: {job_id}")

    # 1. Pull the extracted-entities row for this job.
    resp = client.table("job_entities").select("*").eq("job_id", job_id).execute()
    if not resp.data:
        print(f"⚠️ Job entities not found for job_id={job_id}")
        return

    entities = resp.data[0]

    # 2. Normalize list-ish fields: accept a real list, a CSV string, or nothing.
    def parse_list(value):
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            return [part.strip() for part in value.split(",") if part.strip()]
        return []

    skills = parse_list(entities.get("skills"))
    technical_skills = parse_list(entities.get("technical_skills"))
    tools = parse_list(entities.get("tools"))
    certifications = parse_list(entities.get("certifications"))
    experience = entities.get("experience") or ""
    education = entities.get("education") or ""

    try:
        # 3. Build the entity-wise embedding payload.
        payload = {
            "job_id": job_id,
            "skills": generate_list_embedding(skills),
            "technical_skills": generate_list_embedding(technical_skills),
            "tools": generate_list_embedding(tools),
            "experience": generate_embedding(experience),
            "education": generate_embedding(education),
            "certifications": generate_list_embedding(certifications),
            "updated_at": "now()",
        }
        # 4. Upsert so reruns overwrite instead of duplicating.
        client.table("job_embeddings").upsert(payload).execute()
        print(f"✅ Job embeddings stored for job_id={job_id}")
    except Exception as e:
        print(f"❌ Job embedding generation failed: {e}")
|
| 107 |
+
|
| 108 |
+
|
backend/src/embeddings/local_embedder.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List, Any
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
from supabase import create_client
|
| 8 |
+
from sentence_transformers import SentenceTransformer
|
| 9 |
+
|
| 10 |
+
# Load env
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
SUPABASE_URL = os.environ.get("SUPABASE_URL")
|
| 14 |
+
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") or os.environ.get("SUPABASE_KEY")
|
| 15 |
+
|
| 16 |
+
# Initialize Model (Globals are bad but efficient for serverless-ish/script use)
|
| 17 |
+
# Using a singleton pattern to avoid reloading model on every call if imported
|
| 18 |
+
_model = None
|
| 19 |
+
|
| 20 |
+
def get_model():
    """Return the cached bge-m3 sentence-transformer, loading it on first use."""
    global _model
    if _model is not None:
        return _model
    print("📥 Loading BAAI/bge-m3 model...")
    _model = SentenceTransformer('BAAI/bge-m3')
    return _model
|
| 26 |
+
|
| 27 |
+
def get_supabase():
    """Build a Supabase client from env vars; return None when credentials are absent."""
    if SUPABASE_URL and SUPABASE_KEY:
        return create_client(SUPABASE_URL, SUPABASE_KEY)
    print("❌ Missing Supabase credentials for embeddings.")
    return None
|
| 32 |
+
|
| 33 |
+
def generate_embedding(text: str) -> List[float]:
    """Embed one string with bge-m3 (1024-d output); blank/None input yields a zero vector."""
    if not text or not text.strip():
        return [0.0] * 1024  # BGE-M3 is 1024d
    encoded = get_model().encode(text, normalize_embeddings=True)
    return encoded.tolist()
|
| 41 |
+
|
| 42 |
+
def generate_list_embedding(items: List[str]) -> List[float]:
    """Mean-pool the bge-m3 embeddings of each item into one 1024-d vector; empty input yields zeros."""
    if not items:
        return [0.0] * 1024
    vectors = get_model().encode(items, normalize_embeddings=True)
    return np.mean(vectors, axis=0).tolist()
|
| 51 |
+
|
| 52 |
+
def safe_generate_and_store_embeddings(client, user_id: str) -> None:
    """
    Fetch a user's profile row, embed its fields with bge-m3, and upsert the
    vectors into profile_embeddings. Embedding/upsert errors are logged, not raised.
    """
    print(f"🧬 Generating embeddings for User: {user_id}")

    # 1. Fetch the profile row.
    resp = client.table("profiles").select("*").eq("id", user_id).execute()
    if not resp.data:
        print(f"⚠️ Profile not found for {user_id}")
        return
    profile = resp.data[0]

    # 2. Plain-text fields.
    summary = profile.get("summary") or ""
    headline = profile.get("headline") or ""
    role = profile.get("role") or ""

    # List-ish fields may arrive as a real list or a CSV string — handle both.
    def parse_list(value):
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            return [piece.strip() for piece in value.split(",") if piece.strip()]
        return []

    skills = parse_list(profile.get("skills"))
    tech_skills = parse_list(profile.get("technical_skills"))
    certifications = parse_list(profile.get("certifications"))

    # work_experience may be JSONB: flatten each entry to "Role at Company. Description".
    experience = []
    raw_experience = profile.get("work_experience") or []
    if isinstance(raw_experience, list):
        for entry in raw_experience:
            if isinstance(entry, dict):
                experience.append(
                    f"{entry.get('role') or ''} at {entry.get('company') or ''}. {entry.get('description') or ''}"
                )
            elif isinstance(entry, str):
                experience.append(entry)

    try:
        # 3. Embed and 4. upsert — column names match create_profile_embeddings.sql.
        payload = {
            "id": user_id,
            "headline": generate_embedding(f"{role} {headline}"),
            "summary": generate_embedding(summary),
            "skills": generate_list_embedding(skills),
            "technical_skills": generate_list_embedding(tech_skills),
            "experience": generate_list_embedding(experience),
            "certifications": generate_list_embedding(certifications),
            "updated_at": "now()",
        }
        client.table("profile_embeddings").upsert(payload).execute()
        print(f"✅ Embeddings stored for {user_id}")
    except Exception as e:
        print(f"❌ Embedding generation failed: {e}")
|
| 131 |
+
|
| 132 |
+
if __name__ == "__main__":
    # Manual test hook: construct a client; no embedding run is wired up yet.
    sb = get_supabase()
    if sb:
        # Replace with a valid ID for testing if needed
        pass
|
backend/src/embeddings/process_all_profiles.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import sys
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
# Add backend to path
|
| 7 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
| 8 |
+
|
| 9 |
+
from supabase_ingest import client, safe_generate_and_store_embeddings
|
| 10 |
+
|
| 11 |
+
def process_all_profiles():
    """Re-generate profile embeddings for every row in the `profiles` table."""
    print("🔍 Fetching all user IDs from 'profiles' table...")

    try:
        # Only the IDs are needed here: the embedding helper re-fetches the
        # full profile row itself.
        resp = client.table("profiles").select("id").execute()

        if not resp.data:
            print("⚠️ No profiles found in database.")
            return

        rows = resp.data
        total = len(rows)
        print(f"✅ Found {total} profiles to process.")

        for position, row in enumerate(rows, start=1):
            uid = row['id']
            print(f"\n[{position}/{total}] Processing User ID: {uid}")

            # Handles: fetching the full profile, parsing CSV lists,
            # generating BGE-M3 embeddings, upserting to profile_embeddings.
            safe_generate_and_store_embeddings(client, uid)

            # Small delay to be nice to the CPU/API
            # time.sleep(0.1)

        print("\n🎉 Batch processing complete!")

    except Exception as err:
        print(f"❌ Error fetching profiles: {err}")
|
| 44 |
+
|
| 45 |
+
if __name__ == "__main__":
|
| 46 |
+
process_all_profiles()
|
backend/src/embeddings/test_embedder.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Smoke test: generate embeddings for a sample profile and verify dimensions."""
import sys
import os
import numpy as np

# Add backend to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

from backend.src.embeddings.local_embedder import generate_embeddings

sample_data = {
    "headline": "Senior Software Engineer",
    "summary": "Experienced in Python and AI.",
    "skills": ["Communication", "Leadership", "Agile"],
    "technical_skills": ["Python", "FastAPI", "React"],
    "certifications": [],  # Empty list — the embedder is expected to skip it
    "languages": ["English", "Spanish"]
}

print("Running Embedding Generation Test...")
result = generate_embeddings(sample_data)

print("\nResults:")
for field, vec in result.items():
    dims = len(vec)
    print(f"Field: {field:20} | Dimensions: {dims} | Sample: {vec[:3]}...")

    # BGE-M3 embeddings should be 1024-dimensional.
    if dims != 1024:
        print(f"❌ ERROR: Expected 1024 dimensions, got {dims}")

if "certifications" not in result:
    print("Field: certifications | Correctly skipped (empty)")

print("\nDone.")
|
backend/src/extraction/__init__.py
ADDED
|
File without changes
|
backend/src/extraction/fallback_extractor.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def extract_fallback(text: str) -> dict:
    """
    Regex-based fallback extractor used when Gemini extraction fails.

    Pulls basic contact info (email, phone, links) and keyword-matched skills
    out of raw resume text and returns a dict shaped like the Gemini schema,
    so downstream ingestion can treat both extraction paths identically.
    """

    # 1. Email (Basic). Currently not returned (the auth email is used
    #    downstream instead), but kept available for future mapping.
    email_params = r"[\w\.-]+@[\w\.-]+\.\w+"
    email_match = re.search(email_params, text)
    email = email_match.group(0) if email_match else None

    # 2. Phone (Very Basic - catches 10-12 digit numbers)
    phone_match = re.search(r"(\+?\d{1,3}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}", text)
    phone = phone_match.group(0) if phone_match else None

    # 3. Links (LinkedIn / GitHub / Portfolio)
    links = re.findall(r"https?://[^\s]+", text)
    linkedin = next((l for l in links if "linkedin.com" in l), None)
    github = next((l for l in links if "github.com" in l), None)
    portfolio = next((l for l in links if l not in [linkedin, github]), None)

    # 4. Keyword Matching for Skills (Static List)
    COMMON_SKILLS = [
        "Python", "Java", "JavaScript", "TypeScript", "C++", "C#", "SQL", "NoSQL",
        "React", "Angular", "Vue", "Node.js", "Django", "Flask", "FastAPI",
        "AWS", "Azure", "GCP", "Docker", "Kubernetes", "Git", "CI/CD",
        "Machine Learning", "Deep Learning", "NLP", "Pandas", "NumPy", "TensorFlow", "PyTorch"
    ]

    # BUG FIX: plain \b word boundaries can never match skills that END in a
    # non-word character ("C++", "C#"): there is no word boundary between
    # '+'/'#' and a following space or end-of-string. Explicit lookarounds
    # behave identically for alphanumeric skills but also match the
    # symbol-suffixed ones.
    found_skills = [
        skill for skill in COMMON_SKILLS
        if re.search(r"(?<!\w)" + re.escape(skill) + r"(?!\w)", text, re.IGNORECASE)
    ]

    # 5. Construct Payload (Matches Schema)
    return {
        "headline": None,
        "summary": text[:500] + "..." if len(text) > 500 else text,  # Fallback summary is just first 500 chars
        "skills": found_skills,
        "technical_skills": found_skills,  # Duplicate for safety
        "education": [],
        "work_experience": [],
        "certifications": [],
        "languages": [],
        "experience_years": None,
        # Extra fields specific to Supabase Ingest (mapped later)
        # "email": email,  # Backend doesn't use extracted email usually (uses auth), but good to have
        "phone": phone,
        "linkedin": linkedin,
        "github": github,
        "portfolio": portfolio
    }
|
backend/src/extraction/job_extractor.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
from typing import Any, Dict, List, Optional
|
| 7 |
+
from google import genai
|
| 8 |
+
from google.genai import types
|
| 9 |
+
|
| 10 |
+
from supabase import create_client
|
| 11 |
+
|
| 12 |
+
# ------------------ CONFIGURATION ------------------
# Local staging folders for job data; PROCESSED_DIR is created during batch
# processing, RAW_DIR is currently unused in this module.
RAW_DIR = "data/jobs/raw"
PROCESSED_DIR = "data/jobs/entities"

# ------------------ SETUP ------------------
load_dotenv()
# Prefer the service-role key (full DB access); fall back to the generic key.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") or os.environ.get("SUPABASE_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Initialize the Gemini client once at import time. `client` remains None
# when the key is missing or initialization fails; extraction functions
# check for this and disable themselves instead of crashing.
if GEMINI_API_KEY:
    try:
        client = genai.Client(api_key=GEMINI_API_KEY)
    except Exception as e:
        client = None
        print(f"⚠️ Failed to initialize Gemini client: {e}")
else:
    client = None
    print("⚠️ GEMINI_API_KEY not set; extraction will be disabled.")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def clean_text(text: str) -> str:
    """Normalize a job description for extraction: strip HTML-like tags,
    drop non-ASCII runs, and collapse all whitespace to single spaces."""
    no_tags = re.sub(r"<.*?>", " ", text)            # remove markup tags
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", no_tags)  # replace non-ASCII runs
    collapsed = re.sub(r"\s+", " ", ascii_only)      # squeeze whitespace
    return collapsed.strip()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def extract_job_entities_gemini(text: str) -> Dict[str, Any]:
    """
    Extract categorized entities (skills, qualifications, experience
    requirements) from a job description via Gemini.

    Returns the parsed JSON dict on success, or {} when the client is
    unavailable, retries are exhausted, or a non-retryable error occurs.
    Retries with exponential backoff on 503/"overloaded" errors.
    """
    cleaned_text = clean_text(text)

    system_prompt = """
    You are an intelligent information extractor specialized in job descriptions.
    Your task is to extract ONLY what is explicitly mentioned and categorize them into the following JSON structure.

    Output JSON Schema:
    {
    "skills": ["List of soft skills, general competencies..."],
    "technical_skills": ["List of technical skills, programming languages, tools..."],
    "qualification": ["List of educational qualifications..."],
    "work_experience": ["List of work experience requirements..."],
    "preferred_skills": ["List of preferred/nice-to-have skills..."]
    }

    Rules:
    - Extract exact text as it appears.
    - Do NOT infer or add anything not stated.
    - If no data for a category, return an empty list [].
    - Output MUST be valid JSON.
    """

    # Module-level `client` is None when GEMINI_API_KEY was missing/invalid.
    if client is None:
        print("❌ Extraction disabled (no Client).")
        return {}

    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash-lite",
                contents=system_prompt + "\n\nJOB DESCRIPTION:\n" + cleaned_text,
                config=types.GenerateContentConfig(
                    temperature=0.1,
                    # Requests raw JSON output (no Markdown wrapping).
                    response_mime_type="application/json"
                )
            )

            extracted_text = response.text.strip()
            # Clean potential markdown fences if present (though response_mime_type usually handles it)
            if extracted_text.startswith("```json"):
                extracted_text = extracted_text[7:]
            if extracted_text.startswith("```"):
                extracted_text = extracted_text[3:]
            if extracted_text.endswith("```"):
                extracted_text = extracted_text[:-3]

            return json.loads(extracted_text)

        except Exception as e:
            # Overload errors are retried with exponential backoff (2s, 4s,
            # 8s); any other failure — including invalid JSON — aborts and
            # returns {} so callers can skip this job.
            error_str = str(e)
            if "503" in error_str or "overloaded" in error_str.lower():
                wait_time = 2 ** (attempt + 1)
                print(f"⚠️ Model overloaded. Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"❌ Gemini Extraction failed: {e}")
                return {}

    # All retries exhausted on overload.
    return {}
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def upsert_job_entities(sb, job_id: str, experience_level: str, data: Dict[str, Any]) -> None:
    """
    Upserts the extracted entities into the jobs_entities table.

    Missing entity categories default to empty lists; DB errors are logged
    rather than raised so one bad row does not abort a batch.
    """
    row = {
        "job_id": job_id,
        "experience_level": experience_level,
    }
    # All entity categories share the same shape: list of strings.
    for field in ("skills", "technical_skills", "qualification",
                  "work_experience", "preferred_skills"):
        row[field] = data.get(field, [])
    row["updated_at"] = "now()"

    try:
        sb.table("jobs_entities").upsert(row).execute()
        print(f"✅ Database updated for Job ID: {job_id}")
    except Exception as exc:
        print(f"❌ DB Upsert Error for {job_id}: {exc}")
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def process_single_job(sb, job_id: str, description: str, experience_level: str = None) -> None:
    """
    Processes a single job: extracts entities via Gemini and upserts them
    into the jobs_entities table. Empty descriptions and empty extraction
    results are skipped with a warning.
    """
    if not (description and description.strip()):
        print(f"⚠️ Skipping empty description for job {job_id}")
        return

    print(f"🔍 Processing Job ID: {job_id}")

    entities = extract_job_entities_gemini(description)
    if not entities:
        print("⚠️ No entities extracted.")
        return

    upsert_job_entities(sb, job_id, experience_level, entities)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def process_jobs_from_db() -> None:
    """
    Fetch every row from the Supabase ``jobs`` table and run entity
    extraction + upsert for each one.

    Requires SUPABASE_URL and a Supabase key in the environment. All setup
    and query failures are logged and cause an early return instead of
    raising, so this is safe to call from a scheduler.
    """
    if not SUPABASE_URL or not SUPABASE_KEY:
        print("⚠️ SUPABASE_URL or SUPABASE_KEY not set; skipping job fetch")
        return

    try:
        sb = create_client(SUPABASE_URL, SUPABASE_KEY)
    except Exception as e:
        print(f"⚠️ Failed to create Supabase client: {e}")
        return

    # Fetch jobs from 'jobs' table
    try:
        resp = sb.table("jobs").select("id, description, experience_level").execute()
    except Exception as e:
        print(f"⚠️ Supabase query failed: {e}")
        return

    data = resp.data if hasattr(resp, "data") else []
    if not data:
        print("⚠️ No job descriptions returned from Supabase.")
        return

    print(f"found {len(data)} jobs to process.")

    os.makedirs(PROCESSED_DIR, exist_ok=True)

    for row in data:
        job_id = row.get("id")
        desc = row.get("description") or ""

        # BUG FIX: `experience_level` was previously referenced as a bare
        # (undefined) name here, raising NameError on the first row. It must
        # be read from the row selected above.
        process_single_job(sb, job_id, desc, row.get("experience_level"))
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
if __name__ == '__main__':
    # CLI entry point: run the full DB -> Gemini -> DB extraction batch.
    print("🧪 Starting job entity extraction (DB -> Gemini -> DB)...\n")
    process_jobs_from_db()
    print("\n🎯 All jobs processed.")
|
backend/src/extraction/person_details_extraction_gemini.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.ingestion.parser import parse_file
|
| 2 |
+
import json
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import os
|
| 6 |
+
from google import genai
|
| 7 |
+
import google.genai.types as types
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
# Load env
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Module-level Gemini client. NOTE(review): unlike job_extractor, this raises
# at import time if GEMINI_API_KEY is unset/invalid — confirm that is intended.
client=genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Project-relative path to the raw resume drop folder (backend/data/resumes/raw).
BASE_DIR = Path(__file__).resolve().parents[2]
RAW_DIR = BASE_DIR / "data" / "resumes" / "raw"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
SYSTEM_PROMPT = """
|
| 21 |
+
You are a precise resume entity extraction engine.
|
| 22 |
+
|
| 23 |
+
TASK:
|
| 24 |
+
Extract ONLY the information explicitly present in the resume text.
|
| 25 |
+
|
| 26 |
+
OUTPUT RULES:
|
| 27 |
+
- Output MUST be valid JSON
|
| 28 |
+
- Do NOT hallucinate. If a field is missing, use null.
|
| 29 |
+
-Include empty lists for missing array fields.
|
| 30 |
+
-Include all fields in the output, even if null or empty.
|
| 31 |
+
-Include only the fields specified in the schema below.
|
| 32 |
+
- Do NOT include any explanations, notes, or extra text outside the JSON.
|
| 33 |
+
- Ensure the JSON is properly formatted and parsable.
|
| 34 |
+
- Return "work_experience" as a LIST of objects with fields: role, company, year, duration, description.
|
| 35 |
+
- Calculate "duration" (e.g. "2 years", "6 months") from dates if not explicitly stated.
|
| 36 |
+
- Return "education" as a LIST of objects with fields: course, institution, year.
|
| 37 |
+
- For "skills", "technical_skills", "certifications", and "languages", return LISTS of strings.
|
| 38 |
+
- **CRITICAL**: "languages" refers ONLY to human spoken/written languages (e.g., English, Hindi, Spanish). Programming languages (Python, Java, etc.) MUST go into "technical_skills".
|
| 39 |
+
- For single-value fields like "role", "headline", "summary" and return STRING or null.
|
| 40 |
+
-Calculate experience_years as an INTEGER representing total years of experience, or null if not derivable.
|
| 41 |
+
-only use the field names and structure defined in the schema below.
|
| 42 |
+
-strictLY follow the JSON schema provided.
|
| 43 |
+
|
| 44 |
+
JSON SCHEMA:
|
| 45 |
+
{
|
| 46 |
+
"headline": string | null,
|
| 47 |
+
"summary": string | null,
|
| 48 |
+
"skills": string[],
|
| 49 |
+
"technical_skills": string[], <-- Put Programming Languages HERE
|
| 50 |
+
"education": [
|
| 51 |
+
{
|
| 52 |
+
"course": string | null,
|
| 53 |
+
"institution": string | null,
|
| 54 |
+
"year": string | null
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"work_experience": [
|
| 58 |
+
{
|
| 59 |
+
"role": string | null,
|
| 60 |
+
"company": string | null,
|
| 61 |
+
"years": string | null,
|
| 62 |
+
"duration": string | null,
|
| 63 |
+
"description": string | null
|
| 64 |
+
}
|
| 65 |
+
],
|
| 66 |
+
"projects": [
|
| 67 |
+
{
|
| 68 |
+
"title": "string | null",
|
| 69 |
+
"technologies_used": ["string"],
|
| 70 |
+
"description": string | null
|
| 71 |
+
}
|
| 72 |
+
],
|
| 73 |
+
"projects": [
|
| 74 |
+
{
|
| 75 |
+
"title": "string | null",
|
| 76 |
+
"technologies_used": ["string"],
|
| 77 |
+
"description": "string | null"
|
| 78 |
+
}
|
| 79 |
+
],
|
| 80 |
+
"certifications": string[],
|
| 81 |
+
"languages": string[],
|
| 82 |
+
"experience_years": integer | null
|
| 83 |
+
}
|
| 84 |
+
"current_position": string | null,
|
| 85 |
+
"""
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def extract_resume_entities_gemini(text: str) -> dict:
    """
    Extract structured resume entities from (masked) resume text via Gemini.

    Returns the parsed JSON payload as a dict, or {} on failure so callers
    can fall back to the regex extractor. Retries up to 3 times with
    exponential backoff when the model reports overload (503).
    """
    max_retries = 3

    for attempt in range(max_retries):
        try:
            # BUG FIX: the generate_content() call must sit INSIDE the try —
            # 503 "overloaded" errors are raised by the call itself, so with
            # the call outside the try the retry/backoff logic never ran.
            response = client.models.generate_content(
                model="gemini-2.5-flash-lite",
                contents=SYSTEM_PROMPT + "\n\nRESUME TEXT:\n" + text,
                config=types.GenerateContentConfig(
                    temperature=0,
                    # CRITICAL: forces Gemini to return raw JSON without
                    # Markdown formatting.
                    response_mime_type="application/json"
                )
            )

            # Clean the response just in case (removes accidental backticks).
            cleaned_text = response.text.strip()
            if cleaned_text.startswith("```json"):
                cleaned_text = cleaned_text[7:]
            if cleaned_text.startswith("```"):
                cleaned_text = cleaned_text[3:]
            if cleaned_text.endswith("```"):
                cleaned_text = cleaned_text[:-3]

            return json.loads(cleaned_text)

        except Exception as e:
            # NOTE: json.JSONDecodeError is an Exception subclass, so a bad
            # JSON body lands here too and takes the generic error path ({}
            # tells the caller to use the regex fallback). The original code
            # had two additional except clauses after this one; they were
            # unreachable and have been removed.
            error_str = str(e)
            if "503" in error_str or "overloaded" in error_str.lower():
                wait_time = 2 ** (attempt + 1)  # Exponential backoff: 2s, 4s, 8s...
                print(f"⚠️ Model overloaded. Retrying in {wait_time} seconds... (Attempt {attempt+1}/{max_retries})")
                time.sleep(wait_time)
            else:
                # If it's a different error (like Auth), fail immediately
                print(f"❌ Gemini Error: {e}")
                return {}

    # BUG FIX: all retries exhausted — return {} instead of an implicit None
    # so the declared dict return type holds and callers' `if not extracted`
    # checks stay type-consistent.
    return {}
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def process_raw_resumes():
    """Parse every supported resume file in RAW_DIR and print the entities
    extracted by Gemini. Failures are logged per-file and do not abort the run."""
    if not RAW_DIR.exists():
        raise FileNotFoundError(f"Directory not found: {RAW_DIR}")

    supported = {".pdf", ".docx", ".txt"}

    for file_path in RAW_DIR.iterdir():
        if file_path.suffix.lower() not in supported:
            continue

        print(f"\n📄 Processing: {file_path.name}")

        try:
            resume_text = parse_file(str(file_path))
            entities = extract_resume_entities_gemini(resume_text)

            print("✅ Extracted entities:")
            print(entities)

        except Exception as err:
            print(f"❌ Failed for {file_path.name}: {err}")
|
| 154 |
+
|
| 155 |
+
from src.extraction.fallback_extractor import extract_fallback
|
| 156 |
+
from src.preprocess.regex_pii import extract_contact_info_regex, mask_contact_info_regex
|
| 157 |
+
from src.preprocess.anonymizer import extract_name_and_mask
|
| 158 |
+
|
| 159 |
+
def process_single_resume(file_path: str) -> dict:
    """
    Helper function for supabase_ingest.py to process a single downloaded file.

    Pipeline: parse file -> mask contact info (regex) -> mask names (NER) ->
    extract entities from the MASKED text (Gemini, with regex fallback) ->
    merge the real PII back into the result.

    Returns the merged entity dict; on total failure, returns whatever PII
    was recovered (possibly an empty dict). Never raises.
    """
    # Kept at function scope so the except-branch can reuse the masked text
    # and PII gathered before the failure point.
    text = ""
    pii_data = {}

    try:
        # 1. Convert file path to string just in case
        path_str = str(file_path)

        # 2. Parse the text from the file (PDF/DOCX)
        raw_text = parse_file(path_str)

        # 3a. Privacy Step 1: Extract and Mask Contact Info (Regex)
        print("🔒 [1/2] Masking Phone/Email/Links...")
        pii_contact = extract_contact_info_regex(raw_text)
        masked_text_v1 = mask_contact_info_regex(raw_text)

        # 3b. Privacy Step 2: Extract and Mask Candidate Name (NER)
        print("🔒 [2/2] Masking Names (NER)...")
        ner_result = extract_name_and_mask(masked_text_v1)
        final_masked_text = ner_result["masked_text"]
        candidate_name = ner_result["candidate_name"]

        # Merge PII Data. NOTE: this aliases pii_contact — adding
        # "full_name" below also mutates pii_contact.
        pii_data = pii_contact
        pii_data["full_name"] = candidate_name

        # Store masked text for error handling usage
        text = final_masked_text
        print(f"DEBUG: Final Masked Text Length: {len(text)}")
        if len(text) < 50:
            print("⚠️ WARNING: Masked text is suspiciously short!")

        # 4. Send FINAL MASKED text to Gemini (never the raw text, so no PII
        # leaves the machine).
        print("🧠 Sending to Gemini...")
        extracted = extract_resume_entities_gemini(final_masked_text)

        # 5. Fallback if Gemini failed
        if not extracted:
            print("⚠️ Gemini returned empty. Using Regex Fallback.")
            extracted = extract_fallback(final_masked_text)

        # 6. Merge PII back into results (whether from Gemini or Fallback)
        extracted.update(pii_data)

        return extracted

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")
        # Final Fallback attempt: only possible if masking completed before
        # the exception (text is still "" otherwise).
        if text:
            print("⚠️ Exception occurred. Using Regex Fallback on masked text.")
            fallback_data = extract_fallback(text)
            fallback_data.update(pii_data)
            return fallback_data
        return pii_data
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
if __name__ == "__main__":
    # Batch-mode entry point for local testing against the raw resume folder.
    process_raw_resumes()
|
backend/src/extraction/test_regex.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quick manual check of phone extraction in the regex fallback extractor."""
import sys
import os

# Add backend to path so we can import
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

from backend.src.extraction.fallback_extractor import extract_fallback

# A spread of international phone formats plus a no-match control case.
test_cases = [
    "+91 9876543210",          # India
    "+919876543210",           # India No Space
    "9876543210",              # India Local
    "+1 212-555-0199",         # US
    "+44 7911 123456",         # UK Mobile
    "+971 50 1234567",         # UAE
    "+61 412 345 678",         # Australia
    "+49 151 12345678",        # Germany
    "+33 6 12 34 56 78",       # France
    "+81 90-1234-5678",        # Japan
    "Phone: +91 98765-43210",  # In text
    "Call me at 123-456-7890", # US Local in text
    "No phone number here",
]

print("Testing extract_fallback Phone Extraction:")
with open("test_output.txt", "w", encoding="utf-8") as report:
    for case in test_cases:
        extracted = extract_fallback(case)
        line = f"'{case}' -> {extracted.get('phone')}"
        print(line)
        report.write(line + "\n")
|
| 33 |
+
|
backend/src/ingestion/__init__.py
ADDED
|
File without changes
|
backend/src/ingestion/docx_reader.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from docx import Document
|
| 2 |
+
from src.preprocess.cleaner import postprocess_extracted_text
|
| 3 |
+
|
| 4 |
+
def parse_docx(path: str) -> str:
    """
    Extract text from DOCX file.

    Joins all non-empty paragraphs with newlines and runs the shared
    post-processing step so the result is ready for NER and cleaning.
    """
    document = Document(path)
    raw = "\n".join(p.text for p in document.paragraphs if p.text.strip())
    return postprocess_extracted_text(raw)
|
backend/src/ingestion/parser.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from .pdf_reader import parse_pdf
|
| 3 |
+
from .docx_reader import parse_docx
|
| 4 |
+
from src.preprocess.cleaner import postprocess_extracted_text
|
| 5 |
+
from src.preprocess.cleaner import clean_text
|
| 6 |
+
from src.preprocess.anonymizer import remove_pii
|
| 7 |
+
|
| 8 |
+
def parse_file(path: str) -> str:
    """Detect file type by extension and parse accordingly.

    Supports .pdf, .docx, and .txt; the extracted text is cleaned,
    PII-stripped, and post-processed before being returned.

    Raises:
        ValueError: for unsupported extensions.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        raw = parse_pdf(path)
    elif ext == ".docx":
        raw = parse_docx(path)
    elif ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as fh:
            raw = fh.read()
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    return postprocess_extracted_text(remove_pii(clean_text(raw)))
|
backend/src/ingestion/pdf_reader.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pypdf
|
| 2 |
+
import pdfplumber
|
| 3 |
+
from src.preprocess.cleaner import postprocess_extracted_text
|
| 4 |
+
|
| 5 |
+
def parse_pdf(path: str) -> str:
    """
    Extract text from a PDF file.

    Tries pdfplumber first, falls back to pypdf.
    Returns postprocessed text.

    Raises:
        ValueError: if neither extractor produced any non-empty text.
    """
    # Accumulator shared by both extraction attempts.
    # NOTE(review): if pdfplumber raises partway through, the pages it
    # already appended are kept and pypdf's output is appended after them —
    # the result may contain duplicated pages. Confirm this is intended.
    text = ""

    # --- pdfplumber extraction ---
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                # extract_text() may return None for image-only pages.
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return postprocess_extracted_text(text)
    except Exception as e:
        print(f"⚠️ pdfplumber failed for {path}: {e}")

    # --- fallback to pypdf ---
    try:
        with open(path, "rb") as f:
            reader = pypdf.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
            if text.strip():
                return postprocess_extracted_text(text)
    except Exception as e:
        print(f"❌ pypdf also failed for {path}: {e}")

    raise ValueError(f"Unable to extract text from PDF: {path}")
|
backend/src/matching/__init__.py
ADDED
|
File without changes
|
backend/src/matching/similarity.py
ADDED
|
File without changes
|