Nihal D'Souza committed on
Commit
1fdb52f
1 Parent(s): a804ced

This commit fixes the extractive error

Browse files
Files changed (2) hide show
  1. app.py +7 -7
  2. src/textrank.py +9 -6
app.py CHANGED
@@ -27,7 +27,7 @@ if summarization_type == 'Abstractive':
27
  st.sidebar.caption('Summary will be generated by the T5 Transformer Model')
28
  elif summarization_type == 'Extractive':
29
  st.sidebar.caption('Summary will be generated by a custom TextRank Algorithm')
30
- summary_len = st.sidebar.slider('Summary length percentage', 1, 10, 3)
31
  elif summarization_type == 'Both':
32
  st.sidebar.caption('The License text will be first passed through the custom TextRank algorithm and then passed on to the T5 Transformer Model to generate a summary.')
33
 
@@ -41,14 +41,10 @@ if len(input) > 0:
41
  if summarization_type == 'Abstractive':
42
  summary, definitions = summarize_text_with_model(input, model, tokenizer)
43
  if summarization_type == 'Extractive':
44
- summary, definitions = custom_textrank_summarizer(input, summary_len = summary_len/10)
45
  if summarization_type == 'Both':
46
  summary, definitions = summarize_text_with_model(input, model, tokenizer)
47
  summary, _ = custom_textrank_summarizer(summary, summary_len = 1)
48
-
49
- if clean_text:
50
- st.header('Cleaned License Text')
51
- st.write(clean_license_text(input)[0])
52
 
53
  st.header('Summary')
54
  st.write(summary)
@@ -59,5 +55,9 @@ if len(input) > 0:
59
 
60
  if definitions:
61
  st.header('Definitions')
62
- st.write(definitions)
 
 
 
 
63
 
 
27
  st.sidebar.caption('Summary will be generated by the T5 Transformer Model')
28
  elif summarization_type == 'Extractive':
29
  st.sidebar.caption('Summary will be generated by a custom TextRank Algorithm')
30
+ summary_len = st.sidebar.slider('Summary length percentage', 1, 100, 30)
31
  elif summarization_type == 'Both':
32
  st.sidebar.caption('The License text will be first passed through the custom TextRank algorithm and then passed on to the T5 Transformer Model to generate a summary.')
33
 
 
41
  if summarization_type == 'Abstractive':
42
  summary, definitions = summarize_text_with_model(input, model, tokenizer)
43
  if summarization_type == 'Extractive':
44
+ summary, definitions = custom_textrank_summarizer(input, summary_len = summary_len/100)
45
  if summarization_type == 'Both':
46
  summary, definitions = summarize_text_with_model(input, model, tokenizer)
47
  summary, _ = custom_textrank_summarizer(summary, summary_len = 1)
 
 
 
 
48
 
49
  st.header('Summary')
50
  st.write(summary)
 
55
 
56
  if definitions:
57
  st.header('Definitions')
58
+ st.write(definitions)
59
+
60
+ if clean_text:
61
+ st.header('Cleaned License Text')
62
+ st.write(clean_license_text(input)[0])
63
 
src/textrank.py CHANGED
@@ -8,8 +8,6 @@ from collections import Counter
8
  from src.clean import clean_license_text
9
  from src.read_data import read_file
10
 
11
- nltk.download('punkt')
12
-
13
  properties_dict = {
14
  "modify":['modify', 'modification', 'change'],
15
  "distribute":['distribute', 'distribution'],
@@ -37,14 +35,19 @@ def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, de
37
  '''
38
  TODO: Doctrings
39
  '''
40
- summary_len = math.ceil(summary_len*len(license_text.split('.')))
41
  sent_scores = {}
42
  cleaned_license_text, definitions = clean_license_text(license_text)
43
- for i in cleaned_license_text.split('.'):
 
 
 
 
 
 
44
  if debug:
45
  print(i.split())
46
  if len(i.split()) < min_sent_len:
47
- break
48
  score = 0
49
  for prop, prop_words in properties_dict.items():
50
  prop_score = 0
@@ -52,7 +55,7 @@ def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, de
52
  word_count = Counter([tok for tok in lemmatized_tokens])
53
  for prop_word in prop_words:
54
  if prop_word in word_count.keys():
55
- prop_score += properties_scores[prop_word]
56
  if debug:
57
  print(prop, "=", prop_score)
58
  score += prop_score
 
8
  from src.clean import clean_license_text
9
  from src.read_data import read_file
10
 
 
 
11
  properties_dict = {
12
  "modify":['modify', 'modification', 'change'],
13
  "distribute":['distribute', 'distribution'],
 
35
  '''
36
  TODO: Doctrings
37
  '''
 
38
  sent_scores = {}
39
  cleaned_license_text, definitions = clean_license_text(license_text)
40
+ cleaned_license_sentences = cleaned_license_text.split('.')
41
+ summary_len = math.ceil(summary_len*len(cleaned_license_sentences))
42
+ if debug:
43
+ print(f'summary length:{summary_len}')
44
+ if debug:
45
+ print(cleaned_license_sentences)
46
+ for i in cleaned_license_sentences:
47
  if debug:
48
  print(i.split())
49
  if len(i.split()) < min_sent_len:
50
+ continue
51
  score = 0
52
  for prop, prop_words in properties_dict.items():
53
  prop_score = 0
 
55
  word_count = Counter([tok for tok in lemmatized_tokens])
56
  for prop_word in prop_words:
57
  if prop_word in word_count.keys():
58
+ prop_score += properties_scores[prop]
59
  if debug:
60
  print(prop, "=", prop_score)
61
  score += prop_score