manasvinid committed
Commit 8981128
1 Parent(s): 04ff643

Update app.py

Files changed (1): app.py (+45 -3)
app.py CHANGED
@@ -27,6 +27,9 @@ st.header('Input')
 jobs_data= job_desc_pdf()
 resume_data= resume_pdf()
 
+st.write('input to df:')
+st.write(jobs_data)
+st.write(resume_data)
 
 # setup_nltk_resources()
 
@@ -40,30 +43,69 @@ jobs_data['processed_description'] = jobs_data['description'].apply(preprocess_t
 jobs_data_cleaned = drop_duplicates(jobs_data, column_name='description')
 
 
+
 resume_data['processed_resume'] = resume_data['Resume'].apply(preprocess_text)
 resume_data_cleaned = drop_duplicates(resume_data, column_name='Resume')
 
+st.write("CLEANED")
+st.write(jobs_data_cleaned)
+st.write(resume_data_cleaned)
+
+
 jobs_data_cleaned_with_tokens = add_token_count_column(jobs_data_cleaned, column_name='processed_description')
 resume_data_cleaned_with_tokens = add_token_count_column(resume_data_cleaned, column_name='processed_resume')
 
+
+
+
 # Dropping unnecessary columns from jobs data
 jobs_data_final = jobs_data_cleaned_with_tokens[['processed_description', 'token_count']]
 
 # Dropping unnecessary columns from resume data
 resume_data_final = resume_data_cleaned_with_tokens[['processed_resume', 'token_count']]
 
+st.write("CLEANED WITH TOKENS")
+st.write(jobs_data_final)
+st.write(resume_data_final)
 
 summarizer = TextSummarizer("geekradius/bart-large-cnn-fintetuned-samsum-repo")
 
-# Summarize the top 100 'processed_description' of jobs_data_final
-top_jobs_data = jobs_data_final.head(100)
 
 # Summarize jobs descriptions
-jobs_data_summarized = batch_summarize(top_jobs_data, 'processed_description', summarizer, batch_size=10, output_col='summarized_description')
+jobs_data_summarized = batch_summarize(jobs_data_final, 'processed_description', summarizer, batch_size=10, output_col='summarized_description')
 
 # Summarize all 'processed_resume' in resume_data_final
 resume_data_summarized = batch_summarize(resume_data_final, 'processed_resume', summarizer, batch_size=10, output_col='summarized_resume')
 
 
+# Example Usage
+encoder = SentenceTransformerEncoder(model_name='all-MiniLM-L6-v2')
+
+# Encoding the summarized job descriptions
+jobs_data_summarized_and_encoded = encoder.encode_column(jobs_data_summarized, 'summarized_description')
+
+# Encoding the summarized resumes
+resume_data_summarized_and_encoded = encoder.encode_column(resume_data_summarized, 'summarized_resume')
+
+
+# Combine the jobs data
+jobs_combined = pd.merge(
+    jobs_data_final,
+    jobs_data_summarized_and_encoded[['summarized_description', 'summarized_description_encoded']],
+    left_index=True, right_index=True)
+
+# Combine the resume data
+resume_combined = pd.merge(
+    resume_data_final,
+    resume_data_summarized_and_encoded[['summarized_resume', 'summarized_resume_encoded']],
+    left_index=True, right_index=True)
+
+# Reset index of each DataFrame
+jobs_combined.reset_index(drop=True, inplace=True)
+resume_combined.reset_index(drop=True, inplace=True)
+
 
 
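Note: preprocess_text, drop_duplicates, and add_token_count_column are helpers defined elsewhere in this repo; the diff only calls them. A minimal sketch of the two pandas helpers, assuming whitespace tokenization for the token count (the real implementations may differ):

import pandas as pd

def drop_duplicates(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    # Keep the first occurrence of each distinct value in column_name,
    # without resetting the index (the later merges join on it).
    return df.drop_duplicates(subset=[column_name])

def add_token_count_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    # Add a 'token_count' column; whitespace tokenization is an assumption.
    out = df.copy()
    out['token_count'] = out[column_name].str.split().str.len()
    return out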
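TextSummarizer and batch_summarize are likewise imported, not defined here. A hedged sketch of how they could fit together, assuming TextSummarizer wraps a transformers summarization pipeline and batch_summarize chunks the column by batch_size (everything beyond the call sites above is a guess):

import pandas as pd
from transformers import pipeline

class TextSummarizer:
    # Assumption: a thin wrapper around the transformers summarization pipeline.
    def __init__(self, model_name: str):
        self.pipe = pipeline('summarization', model=model_name)

    def summarize(self, texts):
        # The pipeline returns one {'summary_text': ...} dict per input text.
        return [r['summary_text'] for r in self.pipe(texts, truncation=True)]

def batch_summarize(df, text_col, summarizer, batch_size=10, output_col='summary'):
    # Summarize df[text_col] in fixed-size batches; keep the original index
    # so the later index-based merges still line up.
    summaries = []
    for start in range(0, len(df), batch_size):
        batch = df[text_col].iloc[start:start + batch_size].tolist()
        summaries.extend(summarizer.summarize(batch))
    out = df.copy()
    out[output_col] = summaries
    return out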
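SentenceTransformerEncoder points at the sentence-transformers package, which ships the all-MiniLM-L6-v2 model used above. A sketch consistent with the '<column>_encoded' names the merges expect (the class itself is not in this diff):

import pandas as pd
from sentence_transformers import SentenceTransformer

class SentenceTransformerEncoder:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def encode_column(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
        # Adds '<column>_encoded' holding one embedding vector per row.
        out = df.copy()
        vectors = self.model.encode(out[column].tolist())
        out[column + '_encoded'] = list(vectors)
        return out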
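On the two pd.merge calls: since encode_column (as sketched) preserves the input index, merging with left_index=True/right_index=True is just an index join, so the same result could be written as:

jobs_combined = jobs_data_final.join(
    jobs_data_summarized_and_encoded[['summarized_description',
                                      'summarized_description_encoded']])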