Spaces:

nihaldsouza1
/

clearlydefined_license_summarizer

Runtime error

Nihal D'Souza committed on May 28, 2022

Commit

1fdb52f

•

1 Parent(s): a804ced

This commit fixes the extractive error

Files changed (2) hide show

app.py CHANGED Viewed

@@ -27,7 +27,7 @@ if summarization_type == 'Abstractive':
     st.sidebar.caption('Summary will be generated by the T5 Transformer Model')
 elif summarization_type == 'Extractive':
     st.sidebar.caption('Summary will be generated by a custom TextRank Algorithm')
-    summary_len = st.sidebar.slider('Summary length percentage', 1, 10, 3)
 elif summarization_type == 'Both':
     st.sidebar.caption('The License text will be first passed through the custom TextRank algorithm and then passed on to the T5 Transformer Model to generate a summary.')
@@ -41,14 +41,10 @@ if len(input) > 0:
         if summarization_type == 'Abstractive':
             summary, definitions = summarize_text_with_model(input, model, tokenizer)
         if summarization_type == 'Extractive':
-                summary, definitions = custom_textrank_summarizer(input, summary_len = summary_len/10)
         if summarization_type == 'Both':
             summary, definitions = summarize_text_with_model(input, model, tokenizer)
             summary, _ = custom_textrank_summarizer(summary, summary_len = 1)
-        if clean_text:
-            st.header('Cleaned License Text')
-            st.write(clean_license_text(input)[0])
         st.header('Summary')
         st.write(summary)
@@ -59,5 +55,9 @@ if len(input) > 0:
         if definitions:
             st.header('Definitions')
-            st.write(definitions)

     st.sidebar.caption('Summary will be generated by the T5 Transformer Model')
 elif summarization_type == 'Extractive':
     st.sidebar.caption('Summary will be generated by a custom TextRank Algorithm')
+    summary_len = st.sidebar.slider('Summary length percentage', 1, 100, 30)
 elif summarization_type == 'Both':
     st.sidebar.caption('The License text will be first passed through the custom TextRank algorithm and then passed on to the T5 Transformer Model to generate a summary.')
         if summarization_type == 'Abstractive':
             summary, definitions = summarize_text_with_model(input, model, tokenizer)
         if summarization_type == 'Extractive':
+                summary, definitions = custom_textrank_summarizer(input, summary_len = summary_len/100)
         if summarization_type == 'Both':
             summary, definitions = summarize_text_with_model(input, model, tokenizer)
             summary, _ = custom_textrank_summarizer(summary, summary_len = 1)
         st.header('Summary')
         st.write(summary)
         if definitions:
             st.header('Definitions')
+            st.write(definitions)
+        if clean_text:
+            st.header('Cleaned License Text')
+            st.write(clean_license_text(input)[0])

src/textrank.py CHANGED Viewed

@@ -8,8 +8,6 @@ from collections import Counter
 from src.clean import clean_license_text
 from src.read_data import read_file
-nltk.download('punkt')
 properties_dict = {
     "modify":['modify', 'modification', 'change'],
     "distribute":['distribute', 'distribution'],
@@ -37,14 +35,19 @@ def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, de
     '''
     TODO: Doctrings
     '''
-    summary_len = math.ceil(summary_len*len(license_text.split('.')))
     sent_scores = {}
     cleaned_license_text, definitions = clean_license_text(license_text)
-    for i in cleaned_license_text.split('.'):
         if debug:
             print(i.split())
         if len(i.split()) < min_sent_len:
-            break
         score = 0
         for prop, prop_words in properties_dict.items():
             prop_score = 0
@@ -52,7 +55,7 @@ def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, de
             word_count = Counter([tok for tok in lemmatized_tokens])
             for prop_word in prop_words:
                 if prop_word in word_count.keys():
-                    prop_score += properties_scores[prop_word]
             if debug:
                 print(prop, "=", prop_score)
             score += prop_score

 from src.clean import clean_license_text
 from src.read_data import read_file
 properties_dict = {
     "modify":['modify', 'modification', 'change'],
     "distribute":['distribute', 'distribution'],
     '''
     TODO: Doctrings
     '''
     sent_scores = {}
     cleaned_license_text, definitions = clean_license_text(license_text)
+    cleaned_license_sentences = cleaned_license_text.split('.')
+    summary_len = math.ceil(summary_len*len(cleaned_license_sentences))
+    if debug:
+        print(f'summary length:{summary_len}')
+    if debug:
+        print(cleaned_license_sentences)
+    for i in cleaned_license_sentences:
         if debug:
             print(i.split())
         if len(i.split()) < min_sent_len:
+            continue
         score = 0
         for prop, prop_words in properties_dict.items():
             prop_score = 0
             word_count = Counter([tok for tok in lemmatized_tokens])
             for prop_word in prop_words:
                 if prop_word in word_count.keys():
+                    prop_score += properties_scores[prop]
             if debug:
                 print(prop, "=", prop_score)
             score += prop_score