Nihal D'Souza commited on
Commit
a804ced
1 Parent(s): 0599777

Custom textrank, changes to UI

Browse files
app.py CHANGED
@@ -1,13 +1,13 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import numpy as np
4
  import nltk
5
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
6
  import torch
7
- import os
8
 
9
  from src.doc2vec import inference
10
  from src.abstractive_sum import summarize_text_with_model
 
 
11
 
12
  CUSTOM_MODEL_NAME = "utkarshsaboo45/ClearlyDefinedLicenseSummarizer"
13
 
@@ -19,12 +19,37 @@ with st.spinner('Loading...'):
19
  model = AutoModelForSeq2SeqLM.from_pretrained(CUSTOM_MODEL_NAME).to(device)
20
  tokenizer = AutoTokenizer.from_pretrained(CUSTOM_MODEL_NAME)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  st.title('Clearly Defined: License Summarizer')
23
  input = st.text_area('Enter contents of the license')
24
 
25
  if len(input) > 0:
26
  with st.spinner('Loading...'):
27
- summary = summarize_text_with_model(input, model, tokenizer)
 
 
 
 
 
 
 
 
 
 
 
28
  st.header('Summary')
29
  st.write(summary)
30
 
@@ -32,3 +57,7 @@ if len(input) > 0:
32
  st.header('Similarity Index')
33
  st.dataframe(prediction_scores)
34
 
 
 
 
 
 
1
+ import os
 
 
2
  import nltk
3
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
  import torch
5
+ import streamlit as st
6
 
7
  from src.doc2vec import inference
8
  from src.abstractive_sum import summarize_text_with_model
9
+ from src.textrank import custom_textrank_summarizer
10
+ from src.clean import clean_license_text
11
 
12
  CUSTOM_MODEL_NAME = "utkarshsaboo45/ClearlyDefinedLicenseSummarizer"
13
 
 
19
  model = AutoModelForSeq2SeqLM.from_pretrained(CUSTOM_MODEL_NAME).to(device)
20
  tokenizer = AutoTokenizer.from_pretrained(CUSTOM_MODEL_NAME)
21
 
22
+ summarization_type = st.sidebar.selectbox(
23
+ "Select summarization type.",
24
+ ("Abstractive", "Extractive", "Both")
25
+ )
26
+ if summarization_type == 'Abstractive':
27
+ st.sidebar.caption('Summary will be generated by the T5 Transformer Model')
28
+ elif summarization_type == 'Extractive':
29
+ st.sidebar.caption('Summary will be generated by a custom TextRank Algorithm')
30
+ summary_len = st.sidebar.slider('Summary length percentage', 1, 10, 3)
31
+ elif summarization_type == 'Both':
32
+ st.sidebar.caption('The License text will be first passed through the custom TextRank algorithm and then passed on to the T5 Transformer Model to generate a summary.')
33
+
34
+ clean_text = st.sidebar.checkbox('Show cleaned license text')
35
+
36
  st.title('Clearly Defined: License Summarizer')
37
  input = st.text_area('Enter contents of the license')
38
 
39
  if len(input) > 0:
40
  with st.spinner('Loading...'):
41
+ if summarization_type == 'Abstractive':
42
+ summary, definitions = summarize_text_with_model(input, model, tokenizer)
43
+ if summarization_type == 'Extractive':
44
+ summary, definitions = custom_textrank_summarizer(input, summary_len = summary_len/10)
45
+ if summarization_type == 'Both':
46
+ summary, definitions = summarize_text_with_model(input, model, tokenizer)
47
+ summary, _ = custom_textrank_summarizer(summary, summary_len = 1)
48
+
49
+ if clean_text:
50
+ st.header('Cleaned License Text')
51
+ st.write(clean_license_text(input)[0])
52
+
53
  st.header('Summary')
54
  st.write(summary)
55
 
 
57
  st.header('Similarity Index')
58
  st.dataframe(prediction_scores)
59
 
60
+ if definitions:
61
+ st.header('Definitions')
62
+ st.write(definitions)
63
+
data/choosealicense_appendix_labels.csv CHANGED
@@ -1,42 +1,42 @@
1
- spdx_id,license_name,commercial-use,disclose-source,distribution,document-changes,include-copyright,include-copyright--source,liability,modifications,network-use-disclose,patent-use,private-use,same-license,same-license--file,same-license--library,trademark-use,warranty,GTLC_Permissive,GTLC_Notes
2
- 0bsd,BSD Zero Clause License,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations,permissive,
3
- afl-3.0,Academic Free License v3.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations,permissive,
4
- agpl-3.0,GNU Affero General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,,limitations,not_permissive,
5
- apache-2.0,Apache License 2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations,permissive,
6
- artistic-2.0,Artistic License 2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations,not_permissive,
7
- bsd-2-clause,BSD 2-Clause Simplified License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations,permissive,
8
- bsd-3-clause,BSD 3-Clause New or Revised License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations,permissive,
9
- bsd-3-clause-clear,BSD 3-Clause Clear License,permissions,,permissions,,conditions,,limitations,permissions,,limitations,permissions,,,,,limitations,permissive,
10
- bsd-4-clause,BSD 4-Clause Original or Old License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations,permissive,
11
- bsl-1.0,Boost Software License 1.0,permissions,,permissions,,,conditions,limitations,permissions,,,permissions,,,,,limitations,permissive,
12
- cc-by-4.0,Creative Commons Attribution 4.0 International,permissions,,permissions,conditions,conditions,,limitations,permissions,,limitations,permissions,,,,limitations,limitations,permissive,
13
- cc-by-sa-4.0,Creative Commons Attribution Share Alike 4.0 International,permissions,,permissions,conditions,conditions,,limitations,permissions,,limitations,permissions,conditions,,,limitations,limitations,not_permissive,
14
- cc0-1.0,Creative Commons Zero v1.0 Universal,permissions,,permissions,,,,limitations,permissions,,limitations,permissions,,,,limitations,limitations,permissive,
15
- cecill-2.1,CeCILL Free Software License Agreement v2.1,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations,not_permissive,
16
- ecl-2.0,Educational Community License v2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations,,unfamiliar
17
- epl-1.0,Eclipse Public License 1.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations,not_permissive,weak copyleft
18
- epl-2.0,Eclipse Public License 2.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations,not_permissive,weak copyleft
19
- eupl-1.1,European Union Public License 1.1,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations,not_permissive,
20
- eupl-1.2,European Union Public License 1.2,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations,not_permissive,
21
- gpl-2.0,GNU General Public License v2.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,conditions,,,,limitations,not_permissive,
22
- gpl-3.0,GNU General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations,not_permissive,
23
- isc,ISC License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations,permissive,
24
- lgpl-2.1,GNU Lesser General Public License v2.1,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,,,conditions,,limitations,not_permissive,
25
- lgpl-3.0,GNU Lesser General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,conditions,,limitations,not_permissive,
26
- lppl-1.3c,LaTeX Project Public License v1.3c,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,,,,,limitations,,unfamiliar
27
- mit,MIT License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations,permissive,
28
- mit-0,MIT No Attribution,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations,permissive,
29
- mpl-2.0,Mozilla Public License 2.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,,conditions,,limitations,limitations,not_permissive,weak copyleft
30
- ms-pl,Microsoft Public License,permissions,,permissions,,conditions,,,permissions,,permissions,permissions,,,,limitations,limitations,not_permissive,
31
- ms-rl,Microsoft Reciprocal License,permissions,conditions,permissions,,conditions,,,permissions,,permissions,permissions,,conditions,,limitations,limitations,not_permissive,
32
- mulanpsl-2.0,"Mulan Permissive Software License, Version 2",permissions,,permissions,,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations,,unfamiliar
33
- ncsa,University of IllinoisNCSA Open Source License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations,permissive,
34
- odbl-1.0,Open Data Commons Open Database License v1.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,limitations,permissions,conditions,,,limitations,limitations,,unfamiliar
35
- ofl-1.1,SIL Open Font License 1.1,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,conditions,,,,limitations,not_permissive,
36
- osl-3.0,Open Software License 3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations,not_permissive,
37
- postgresql,PostgreSQL License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations,permissive,
38
- unlicense,The Unlicense,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations,permissive,
39
- upl-1.0,Universal Permissive License v1.0,permissions,,permissions,,conditions,,limitations,permissions,,permissions,permissions,,,,,limitations,permissive,
40
- vim,Vim License,permissions,conditions,permissions,conditions,conditions,,,permissions,,,permissions,conditions,,,,,not_permissive,
41
- wtfpl,Do What The Fck You Want To Public License,permissions,,permissions,,,,,permissions,,,permissions,,,,,,,unfamiliar
42
- zlib,zlib License,permissions,,permissions,conditions,,conditions,limitations,permissions,,,permissions,,,,,limitations,permissive,
 
1
+ spdx_id,license_name,commercial-use,disclose-source,distribution,document-changes,include-copyright,include-copyright--source,liability,modifications,network-use-disclose,patent-use,private-use,same-license,same-license--file,same-license--library,trademark-use,warranty
2
+ 0bsd,BSD Zero Clause License,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations
3
+ afl-3.0,Academic Free License v3.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
4
+ agpl-3.0,GNU Affero General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,,limitations
5
+ apache-2.0,Apache License 2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
6
+ artistic-2.0,Artistic License 2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
7
+ bsd-2-clause,BSD 2-Clause Simplified License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
8
+ bsd-3-clause,BSD 3-Clause New or Revised License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
9
+ bsd-3-clause-clear,BSD 3-Clause Clear License,permissions,,permissions,,conditions,,limitations,permissions,,limitations,permissions,,,,,limitations
10
+ bsd-4-clause,BSD 4-Clause Original or Old License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
11
+ bsl-1.0,Boost Software License 1.0,permissions,,permissions,,,conditions,limitations,permissions,,,permissions,,,,,limitations
12
+ cc-by-4.0,Creative Commons Attribution 4.0 International,permissions,,permissions,conditions,conditions,,limitations,permissions,,limitations,permissions,,,,limitations,limitations
13
+ cc-by-sa-4.0,Creative Commons Attribution Share Alike 4.0 International,permissions,,permissions,conditions,conditions,,limitations,permissions,,limitations,permissions,conditions,,,limitations,limitations
14
+ cc0-1.0,Creative Commons Zero v1.0 Universal,permissions,,permissions,,,,limitations,permissions,,limitations,permissions,,,,limitations,limitations
15
+ cecill-2.1,CeCILL Free Software License Agreement v2.1,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
16
+ ecl-2.0,Educational Community License v2.0,permissions,,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
17
+ epl-1.0,Eclipse Public License 1.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
18
+ epl-2.0,Eclipse Public License 2.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
19
+ eupl-1.1,European Union Public License 1.1,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations
20
+ eupl-1.2,European Union Public License 1.2,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations
21
+ gpl-2.0,GNU General Public License v2.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,conditions,,,,limitations
22
+ gpl-3.0,GNU General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,conditions,,,,limitations
23
+ isc,ISC License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
24
+ lgpl-2.1,GNU Lesser General Public License v2.1,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,,,conditions,,limitations
25
+ lgpl-3.0,GNU Lesser General Public License v3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,permissions,permissions,,,conditions,,limitations
26
+ lppl-1.3c,LaTeX Project Public License v1.3c,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,,,permissions,,,,,limitations
27
+ mit,MIT License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
28
+ mit-0,MIT No Attribution,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations
29
+ mpl-2.0,Mozilla Public License 2.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,permissions,permissions,,conditions,,limitations,limitations
30
+ ms-pl,Microsoft Public License,permissions,,permissions,,conditions,,,permissions,,permissions,permissions,,,,limitations,limitations
31
+ ms-rl,Microsoft Reciprocal License,permissions,conditions,permissions,,conditions,,,permissions,,permissions,permissions,,conditions,,limitations,limitations
32
+ mulanpsl-2.0,"Mulan Permissive Software License, Version 2",permissions,,permissions,,conditions,,limitations,permissions,,permissions,permissions,,,,limitations,limitations
33
+ ncsa,University of IllinoisNCSA Open Source License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
34
+ odbl-1.0,Open Data Commons Open Database License v1.0,permissions,conditions,permissions,,conditions,,limitations,permissions,,limitations,permissions,conditions,,,limitations,limitations
35
+ ofl-1.1,SIL Open Font License 1.1,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,conditions,,,,limitations
36
+ osl-3.0,Open Software License 3.0,permissions,conditions,permissions,conditions,conditions,,limitations,permissions,conditions,permissions,permissions,conditions,,,limitations,limitations
37
+ postgresql,PostgreSQL License,permissions,,permissions,,conditions,,limitations,permissions,,,permissions,,,,,limitations
38
+ unlicense,The Unlicense,permissions,,permissions,,,,limitations,permissions,,,permissions,,,,,limitations
39
+ upl-1.0,Universal Permissive License v1.0,permissions,,permissions,,conditions,,limitations,permissions,,permissions,permissions,,,,,limitations
40
+ vim,Vim License,permissions,conditions,permissions,conditions,conditions,,,permissions,,,permissions,conditions,,,,
41
+ wtfpl,Do What The Fck You Want To Public License,permissions,,permissions,,,,,permissions,,,permissions,,,,,
42
+ zlib,zlib License,permissions,,permissions,conditions,,conditions,limitations,permissions,,,permissions,,,,,limitations
requirements.txt CHANGED
@@ -10,15 +10,19 @@ base58==2.1.1
10
  beautifulsoup4==4.11.1
11
  bleach==5.0.0
12
  blinker==1.4
 
13
  cachetools==5.1.0
 
14
  certifi==2021.10.8
15
  cffi==1.15.0
16
  charset-normalizer==2.0.12
17
  click==8.0.4
18
  cycler==0.11.0
 
19
  debugpy==1.6.0
20
  decorator==5.1.1
21
  defusedxml==0.7.1
 
22
  entrypoints==0.4
23
  executing==0.8.3
24
  fastjsonschema==2.15.3
@@ -43,10 +47,12 @@ jupyter-core==4.10.0
43
  jupyterlab-pygments==0.2.2
44
  jupyterlab-widgets==1.1.0
45
  kiwisolver==1.4.2
 
46
  MarkupSafe==2.1.1
47
  matplotlib==3.5.0
48
  matplotlib-inline==0.1.3
49
  mistune==0.8.4
 
50
  nbclient==0.6.3
51
  nbconvert==6.5.0
52
  nbformat==5.4.0
@@ -58,10 +64,12 @@ packaging==21.3
58
  pandas==1.3.4
59
  pandocfilters==1.5.0
60
  parso==0.8.3
 
61
  pexpect==4.8.0
62
  pickleshare==0.7.5
63
  Pillow==9.1.1
64
  pip==22.1
 
65
  prometheus-client==0.14.1
66
  prompt-toolkit==3.0.29
67
  protobuf==3.20.1
@@ -70,6 +78,7 @@ ptyprocess==0.7.0
70
  pure-eval==0.2.2
71
  pyarrow==8.0.0
72
  pycparser==2.21
 
73
  pydeck==0.7.1
74
  Pygments==2.12.0
75
  Pympler==1.0.1
@@ -90,13 +99,18 @@ setuptools==62.3.1
90
  setuptools-scm==6.4.2
91
  six==1.16.0
92
  sklearn==0.0
93
- smart-open==6.0.0
94
  smmap==5.0.0
95
  soupsieve==2.3.2.post1
 
 
 
 
96
  stack-data==0.2.0
97
  streamlit==1.9.0
98
  striprtf==0.0.20
99
  terminado==0.15.0
 
100
  threadpoolctl==3.1.0
101
  tinycss2==1.1.1
102
  tokenizers==0.12.1
@@ -108,11 +122,13 @@ tornado==6.1
108
  tqdm==4.64.0
109
  traitlets==5.2.1.post0
110
  transformers==4.19.2
 
111
  typing_extensions==4.2.0
112
  tzdata==2022.1
113
  tzlocal==4.2
114
  urllib3==1.26.9
115
  validators==0.19.0
 
116
  watchdog==2.1.8
117
  wcwidth==0.2.5
118
  webencodings==0.5.1
 
10
  beautifulsoup4==4.11.1
11
  bleach==5.0.0
12
  blinker==1.4
13
+ blis==0.7.7
14
  cachetools==5.1.0
15
+ catalogue==2.0.7
16
  certifi==2021.10.8
17
  cffi==1.15.0
18
  charset-normalizer==2.0.12
19
  click==8.0.4
20
  cycler==0.11.0
21
+ cymem==2.0.6
22
  debugpy==1.6.0
23
  decorator==5.1.1
24
  defusedxml==0.7.1
25
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl
26
  entrypoints==0.4
27
  executing==0.8.3
28
  fastjsonschema==2.15.3
 
47
  jupyterlab-pygments==0.2.2
48
  jupyterlab-widgets==1.1.0
49
  kiwisolver==1.4.2
50
+ langcodes==3.3.0
51
  MarkupSafe==2.1.1
52
  matplotlib==3.5.0
53
  matplotlib-inline==0.1.3
54
  mistune==0.8.4
55
+ murmurhash==1.0.7
56
  nbclient==0.6.3
57
  nbconvert==6.5.0
58
  nbformat==5.4.0
 
64
  pandas==1.3.4
65
  pandocfilters==1.5.0
66
  parso==0.8.3
67
+ pathy==0.6.1
68
  pexpect==4.8.0
69
  pickleshare==0.7.5
70
  Pillow==9.1.1
71
  pip==22.1
72
+ preshed==3.0.6
73
  prometheus-client==0.14.1
74
  prompt-toolkit==3.0.29
75
  protobuf==3.20.1
 
78
  pure-eval==0.2.2
79
  pyarrow==8.0.0
80
  pycparser==2.21
81
+ pydantic==1.8.2
82
  pydeck==0.7.1
83
  Pygments==2.12.0
84
  Pympler==1.0.1
 
99
  setuptools-scm==6.4.2
100
  six==1.16.0
101
  sklearn==0.0
102
+ smart-open==5.2.1
103
  smmap==5.0.0
104
  soupsieve==2.3.2.post1
105
+ spacy==3.3.0
106
+ spacy-legacy==3.0.9
107
+ spacy-loggers==1.0.2
108
+ srsly==2.4.3
109
  stack-data==0.2.0
110
  streamlit==1.9.0
111
  striprtf==0.0.20
112
  terminado==0.15.0
113
+ thinc==8.0.16
114
  threadpoolctl==3.1.0
115
  tinycss2==1.1.1
116
  tokenizers==0.12.1
 
122
  tqdm==4.64.0
123
  traitlets==5.2.1.post0
124
  transformers==4.19.2
125
+ typer==0.4.1
126
  typing_extensions==4.2.0
127
  tzdata==2022.1
128
  tzlocal==4.2
129
  urllib3==1.26.9
130
  validators==0.19.0
131
+ wasabi==0.9.1
132
  watchdog==2.1.8
133
  wcwidth==0.2.5
134
  webencodings==0.5.1
src/abstractive_sum.py CHANGED
@@ -12,9 +12,8 @@ import pandas as pd
12
  import torch
13
  from torch.utils.data import Dataset, DataLoader
14
  from sklearn.model_selection import train_test_split
 
15
 
16
-
17
- import torch.nn as nn
18
  from tqdm.auto import tqdm
19
 
20
  from transformers import (
@@ -32,7 +31,7 @@ MODEL_PATH = "models/"
32
  MODEL_FILENAME = "t5-base.model"
33
 
34
  MODEL_NAME = "t5-base"
35
- # TOKENIZER = T5Tokenizer.from_pretrained(MODEL_NAME)
36
 
37
  TEXT_MAX_TOKEN_LEN = 512
38
  SUMMARY_MAX_TOKEN_LEN = 128
@@ -56,13 +55,14 @@ class LicenseSummaryDataset(Dataset):
56
  self.text_max_token_len = text_max_token_len
57
  self.summary_max_token_len = summary_max_token_len
58
 
 
59
  def __len__(self):
60
  return len(self.data)
61
 
62
  def __getitem__(self, index: int):
63
  data_row = self.data.iloc[index]
64
  text = data_row["text"]
65
- text_encoding = TOKENIZER(
66
  text,
67
  max_length=self.text_max_token_len,
68
  padding="max_length",
@@ -72,7 +72,7 @@ class LicenseSummaryDataset(Dataset):
72
  return_tensors="pt"
73
  )
74
 
75
- summary_encoding = TOKENIZER(
76
  data_row["summary"],
77
  max_length=self.summary_max_token_len,
78
  padding="max_length",
@@ -111,6 +111,8 @@ def prepare_dataloaders():
111
 
112
  train_df, dev_df = train_test_split(license_summary_data, test_size=0.1)
113
 
 
 
114
  train_dataset = LicenseSummaryDataset(
115
  train_df,
116
  TOKENIZER,
@@ -239,6 +241,8 @@ def summarize_text_with_model(text, model, tokenizer):
239
  Summary of the License text from the given model.
240
 
241
  """
 
 
242
  text_encoding = tokenizer(
243
  text,
244
  max_length=TEXT_MAX_TOKEN_LEN,
@@ -267,10 +271,10 @@ def summarize_text_with_model(text, model, tokenizer):
267
  ) for gen_id in generated_ids
268
  ]
269
 
270
- return "".join(preds)
271
 
272
 
273
- def summarize(text, load_from_huggingface=False):
274
  """
275
  Summarizes the given License text
276
 
@@ -295,6 +299,7 @@ def summarize(text, load_from_huggingface=False):
295
  if os.path.exists(MODEL_PATH + MODEL_FILENAME):
296
  print("Loading Model...")
297
  model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True).to(device)
 
298
  model.load_state_dict(torch.load(MODEL_PATH + MODEL_FILENAME))
299
  model.eval()
300
  else:
@@ -305,8 +310,7 @@ def summarize(text, load_from_huggingface=False):
305
  model = train_and_save_model(train_dataloader, MODEL_PATH + MODEL_FILENAME)
306
  tokenizer = TOKENIZER
307
 
308
- summary = summarize_text_with_model(text, model, tokenizer)
309
- return summary
310
 
311
 
312
  def summarize_license_files(path):
@@ -322,6 +326,6 @@ def summarize_license_files(path):
322
  paths = glob.glob(path + "*.txt")
323
  for license_path in paths:
324
  with open(license_path, "r", encoding="utf-8") as f:
325
- summary = summarize(f.read())
326
  with open(license_path.replace(".txt", "") + "__summary.txt", "w", encoding="utf-8") as f:
327
  f.write(summary)
 
12
  import torch
13
  from torch.utils.data import Dataset, DataLoader
14
  from sklearn.model_selection import train_test_split
15
+ from src.clean import clean_license_text
16
 
 
 
17
  from tqdm.auto import tqdm
18
 
19
  from transformers import (
 
31
  MODEL_FILENAME = "t5-base.model"
32
 
33
  MODEL_NAME = "t5-base"
34
+ TOKENIZER = None
35
 
36
  TEXT_MAX_TOKEN_LEN = 512
37
  SUMMARY_MAX_TOKEN_LEN = 128
 
55
  self.text_max_token_len = text_max_token_len
56
  self.summary_max_token_len = summary_max_token_len
57
 
58
+
59
  def __len__(self):
60
  return len(self.data)
61
 
62
  def __getitem__(self, index: int):
63
  data_row = self.data.iloc[index]
64
  text = data_row["text"]
65
+ text_encoding = self.tokenizer(
66
  text,
67
  max_length=self.text_max_token_len,
68
  padding="max_length",
 
72
  return_tensors="pt"
73
  )
74
 
75
+ summary_encoding = self.tokenizer(
76
  data_row["summary"],
77
  max_length=self.summary_max_token_len,
78
  padding="max_length",
 
111
 
112
  train_df, dev_df = train_test_split(license_summary_data, test_size=0.1)
113
 
114
+ TOKENIZER = T5Tokenizer.from_pretrained(MODEL_NAME)
115
+
116
  train_dataset = LicenseSummaryDataset(
117
  train_df,
118
  TOKENIZER,
 
241
  Summary of the License text from the given model.
242
 
243
  """
244
+ text, definitions = clean_license_text(text)
245
+
246
  text_encoding = tokenizer(
247
  text,
248
  max_length=TEXT_MAX_TOKEN_LEN,
 
271
  ) for gen_id in generated_ids
272
  ]
273
 
274
+ return "".join(preds), definitions
275
 
276
 
277
+ def summarize(text, load_from_huggingface=True):
278
  """
279
  Summarizes the given License text
280
 
 
299
  if os.path.exists(MODEL_PATH + MODEL_FILENAME):
300
  print("Loading Model...")
301
  model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True).to(device)
302
+ TOKENIZER = T5Tokenizer.from_pretrained(MODEL_NAME)
303
  model.load_state_dict(torch.load(MODEL_PATH + MODEL_FILENAME))
304
  model.eval()
305
  else:
 
310
  model = train_and_save_model(train_dataloader, MODEL_PATH + MODEL_FILENAME)
311
  tokenizer = TOKENIZER
312
 
313
+ return summarize_text_with_model(text, model, tokenizer)
 
314
 
315
 
316
  def summarize_license_files(path):
 
326
  paths = glob.glob(path + "*.txt")
327
  for license_path in paths:
328
  with open(license_path, "r", encoding="utf-8") as f:
329
+ summary, _ = summarize(f.read())
330
  with open(license_path.replace(".txt", "") + "__summary.txt", "w", encoding="utf-8") as f:
331
  f.write(summary)
src/clean.py CHANGED
@@ -1,27 +1,46 @@
1
  import re
2
- import os
3
  from bs4 import BeautifulSoup
4
  from striprtf.striprtf import rtf_to_text
5
- import json
6
- import nltk as nltk
 
7
 
8
 
9
  def php_cleaner(text):
10
- return re.findall(r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)", text)[0]
 
 
 
 
 
 
11
  def html_cleaner(text):
12
  soup = BeautifulSoup(text)
13
- return soup.body.text
14
- def json_cleaner(text):
15
- out = ""
16
- for keys in text:
17
- if keys in ('description', 'license'):
18
- out+=keys
19
- out+=": "
20
- out+=str(text[keys])
21
- out+=", "
 
 
 
 
 
22
  return out
 
 
 
 
 
 
23
  def gnu_cleaner(text):
24
  t = text.split('END OF TERMS AND CONDITIONS')[0]
 
25
  if 'Preamble' in text:
26
  if len(t.split('Preamble')[0])>100:
27
  t0 = t.split('Preamble')[0]
@@ -32,49 +51,121 @@ def gnu_cleaner(text):
32
  t1 = t.split('Preamble')[1].split('distribution and\n\nmodification follow')[1]
33
  except:
34
  t1 = t.split('Preamble')[1].split('distribution and modification follow')[1]
35
- return t0+t1
36
  else:
37
- return t.split('Preamble')[1].split('distribution and\nmodification follow')[1]
38
- else:
39
- return t
 
 
 
 
 
 
 
 
 
40
  def rtf_cleaner(text):
41
  return rtf_to_text(text)
42
- def character_cleaner(text):
43
- return re.sub("[=*-/·\n]+", "", text)
44
  def url_cleaner(text):
45
- return re.sub(r'http\S+', '', text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def isEnglish(s):
47
  try:
48
- s.encode(encoding='utf-8').decode('ascii')
49
  except UnicodeDecodeError:
50
  return False
51
  else:
52
  return True
53
-
54
- # input as a text
55
- def clean_license_text(text):
56
- text = text.strip()
57
- if text[:5] == '<?php':
58
- try:
59
- t = php_cleaner(text)
60
- except:
61
- return ""
 
 
 
 
62
  elif "</html>" in text:
63
- t = html_cleaner(text)
64
- elif text[0] == '{' and text[-1] == '}':
65
- with open(file, 'r') as f:
66
- t = json_cleaner(json.load(f))
67
- elif "GNU" in text or "Apache" in text:
68
- t = gnu_cleaner(text)
69
  elif "\\rtf" in text:
70
- t = rtf_cleaner(text)
 
 
 
 
 
 
 
 
 
 
 
 
71
  else:
72
- t = text
 
 
 
 
 
 
 
 
73
 
74
- t = url_cleaner(t)
75
- t = character_cleaner(t)
76
 
77
- if not isEnglish(t):
78
- if not isEnglish(' '.join(t.split()[-5:-1])):
79
- return ""
80
- return t
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
+ import json
3
  from bs4 import BeautifulSoup
4
  from striprtf.striprtf import rtf_to_text
5
+
6
+
7
+ PARA_BREAK = "para___break"
8
 
9
 
10
  def php_cleaner(text):
11
+ try:
12
+ return re.findall("\/\*[\S\s]*?\*\/", text)[0]
13
+ except:
14
+ return ""
15
+ # return re.findall(r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)", text)[0]
16
+
17
+
18
  def html_cleaner(text):
19
  soup = BeautifulSoup(text)
20
+ text = soup.body.text
21
+ if not text:
22
+ return ""
23
+ return text
24
+
25
+
26
+ def json_cleaner(text_dict):
27
+ out = ""
28
+ for key in text_dict.keys():
29
+ if key in ("description", "license"):
30
+ out += key
31
+ out += ": "
32
+ out += str(text_dict[key])
33
+ out += ", "
34
  return out
35
+
36
+
37
+ def discard_text_after_tnc(text):
38
+ return text.split("END OF TERMS AND CONDITIONS")[0]
39
+
40
+
41
  def gnu_cleaner(text):
42
  t = text.split('END OF TERMS AND CONDITIONS')[0]
43
+ definitions = ""
44
  if 'Preamble' in text:
45
  if len(t.split('Preamble')[0])>100:
46
  t0 = t.split('Preamble')[0]
 
51
  t1 = t.split('Preamble')[1].split('distribution and\n\nmodification follow')[1]
52
  except:
53
  t1 = t.split('Preamble')[1].split('distribution and modification follow')[1]
54
+ t = t0+t1
55
  else:
56
+ t = t.split('Preamble')[1].split('distribution and\nmodification follow')[1]
57
+ if 'Definitions' in text:
58
+ try:
59
+ def_pos = re.search(r"[0-9]\.? (Additional )?Definitions",t).span()
60
+ other_start_pos = re.search(r"[0-9]\.? [A-Z][a-z]+",t[def_pos[1]:]).span()[0]
61
+ definitions = t[def_pos[0]: def_pos[1] + other_start_pos]
62
+ t = t[:def_pos[0]] + t[def_pos[1]+other_start_pos:]
63
+ except:
64
+ t = t
65
+ return t, definitions
66
+
67
+
68
  def rtf_cleaner(text):
69
  return rtf_to_text(text)
70
+
71
+
72
  def url_cleaner(text):
73
+ return re.sub(r"http\S+", "", text)
74
+
75
+
76
+ def email_cleaner(text):
77
+ return re.sub(r"\S*@\S*", "", text)
78
+
79
+
80
+ def var_cleaner(text):
81
+ text = re.sub(r"\$\w+", "", text)
82
+ text = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", text)
83
+ return text
84
+
85
+
86
+ def character_cleaner(text):
87
+ text = url_cleaner(text)
88
+ text = email_cleaner(text)
89
+ text = var_cleaner(text)
90
+
91
+ text = re.sub("[\n]{2,}", ". ", text)
92
+ text = re.sub("[:%#<>=*\-/·\s{}]+", " ", text)
93
+ text = re.sub("[\. ]{2,}", ". ", text)
94
+ return text
95
+
96
+
97
  def isEnglish(s):
98
  try:
99
+ s.encode(encoding="utf-8").decode("ascii")
100
  except UnicodeDecodeError:
101
  return False
102
  else:
103
  return True
104
+
105
+
106
+ def preprocess_text(text):
107
+ definitions = ""
108
+ if "GNU" in text or "Apache" in text:
109
+ text, definitions = gnu_cleaner(text)
110
+ definitions = definitions.strip()
111
+ return text, definitions
112
+
113
+
114
+ def script_cleaner(text):
115
+ if "<?php" in text:
116
+ text = php_cleaner(text)
117
  elif "</html>" in text:
118
+ text = html_cleaner(text)
119
+ elif text[0] == "{" and text[-1] == "}":
120
+ text = json_cleaner(json.loads(text))
 
 
 
121
  elif "\\rtf" in text:
122
+ text = rtf_cleaner(text)
123
+ if not text:
124
+ return ""
125
+ return text
126
+
127
+
128
+ def split_paras(text):
129
+ if "\n\n\n\n" in text:
130
+ paras = text.split("\n\n\n\n")
131
+ elif "\n\n\n" in text:
132
+ paras = text.split("\n\n\n")
133
+ elif "\n\n" in text:
134
+ paras = text.split("\n\n")
135
  else:
136
+ paras = [text]
137
+ return paras
138
+
139
+
140
+ def clean_paras(paras):
141
+ return paras
142
+
143
+
144
+ def clean_license_text(text):
145
 
146
+ if len(text) == 0:
147
+ return text
148
 
149
+ text = script_cleaner(text)
150
+ text, definitions = preprocess_text(text)
151
+ paras = clean_paras(split_paras(text))
152
+ text = PARA_BREAK.join(paras)
153
+ text = character_cleaner(text)
154
+ text = re.sub(PARA_BREAK, "\n\n", text)
155
+ text = text.strip()
156
+
157
+ if not isEnglish(text):
158
+ if not isEnglish(" ".join(text.split()[-5:-1])):
159
+ return "", ""
160
+
161
+ return text, definitions
162
+
163
+
164
+ """
165
+ Notes:
166
+
167
+ 1. Regex for other definitions: --------> ".{0,20}".{0,40}means
168
+ 2. Try splitting each para by "\n", if len == 1 and len(para) < 100 (or something)
169
+ -> Merge with the next para
170
+ Ex. "8. Termination."
171
+ """
src/read_data.py CHANGED
@@ -129,7 +129,7 @@ def augment_summary(license_data):
129
  return license_data
130
 
131
 
132
- def read_license_data(labels_file="choosealicense_appendix_labels.csv"):
133
  """
134
  Reads data from Text and Summary File and stores it as a dictionary of
135
  dictionaries.
@@ -142,21 +142,46 @@ def read_license_data(labels_file="choosealicense_appendix_labels.csv"):
142
  corresponding summaries and license texts respectively.
143
 
144
  """
 
145
  files = glob.glob(gold_licenses_data + "*")
 
 
 
 
 
 
146
  data_dict = defaultdict(dict)
 
147
  for file_path in files:
 
 
 
 
 
148
  if file_path.endswith(".summary"):
149
- file_name = file_path.split("/")[-1][:-8]
150
  data_dict[file_name]["summary"] = read_file(file_path)
151
  elif file_path.endswith(".txt"):
152
- file_name = file_path.split("/")[-1][:-4]
153
  data_dict[file_name]["text"] = clean_data(read_file(file_path))
154
 
155
  summary_df = pd.DataFrame(data_dict).T
156
- labels_df = pd.read_csv(data_directory + labels_file, index_col=index_col)
157
 
158
- merged_data = labels_df.join(summary_df).drop(columns=["spdx_id", "GTLC_Notes"])
159
- return merged_data[:5]
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
 
162
  def read_license_summary_data(aug_summary=False):
@@ -193,11 +218,11 @@ def fix_labels(license_data):
193
  "limitations": 2
194
  }
195
 
196
- permissive_not_permissive_map = {
197
- np.nan: 0,
198
- "permissive": 1,
199
- "not_permissive": 2
200
- }
201
 
202
  permissions_columns = [
203
  "commercial-use",
@@ -226,14 +251,14 @@ def fix_labels(license_data):
226
  "patent-use"
227
  ]
228
 
229
- permissive_not_permissive_columns = [
230
- "GTLC_Permissive"
231
- ]
232
 
233
  license_data[permissions_columns] = license_data[permissions_columns].replace(permissions_map)
234
  license_data[conditions_columns] = license_data[conditions_columns].replace(conditions_map)
235
  license_data[limitations_columns] = license_data[limitations_columns].replace(limitations_map)
236
  license_data[permissions_limitations_columns] = license_data[permissions_limitations_columns].replace(permissions_limitations_map)
237
- license_data[permissive_not_permissive_columns] = license_data[permissive_not_permissive_columns].replace(permissive_not_permissive_map)
238
 
239
  return license_data
 
129
  return license_data
130
 
131
 
132
def read_license_data(labels_file="choosealicense_appendix_labels.csv", drop_summary=False):
    """
    Read gold license texts/summaries and join them with the labels CSV.

    Data files are searched both relative to the current working
    directory and one level up, so the function works from the repo root
    as well as from a subdirectory (e.g. ``src/``).

    Parameters
    ----------
    labels_file : str
        Name of the labels CSV inside the data directory.
    drop_summary : bool
        When True, drop the "summary" column from the merged frame.

    Returns
    -------
    pandas.DataFrame or None
        Labels joined with license text/summary per license, or None
        when the data files cannot be located.
    """
    files = glob.glob(gold_licenses_data + "*")
    if not files:
        # Fall back to running from a subdirectory.
        files = glob.glob(f"../{gold_licenses_data}" + "*")
    if not files:
        print("Gold licenses not found, please check the path again!")
        return None

    data_dict = defaultdict(dict)

    for file_path in files:
        # glob may return Windows-style separators; split on whichever
        # separator the path actually uses.
        split_by = "\\" if "\\" in file_path else "/"
        base_name = file_path.split(split_by)[-1]

        if file_path.endswith(".summary"):
            file_name = base_name[:-len(".summary")]
            data_dict[file_name]["summary"] = read_file(file_path)
        elif file_path.endswith(".txt"):
            file_name = base_name[:-len(".txt")]
            data_dict[file_name]["text"] = clean_data(read_file(file_path))

    summary_df = pd.DataFrame(data_dict).T

    # Same two-location fallback for the labels CSV. Catch only missing
    # files rather than a bare `except:` that would also hide CSV parse
    # errors and keyboard interrupts.
    labels_df = None
    for prefix in ("", "../"):
        try:
            labels_df = pd.read_csv(prefix + data_directory + labels_file, index_col=index_col)
            break
        except FileNotFoundError:
            continue
    if labels_df is None:
        print("Labels file not found, please check the path again!")
        return None

    merged_data = labels_df.join(summary_df).drop(columns=["spdx_id"])

    if drop_summary:
        merged_data = merged_data.drop(columns=["summary"])

    return merged_data
185
 
186
 
187
  def read_license_summary_data(aug_summary=False):
 
218
  "limitations": 2
219
  }
220
 
221
+ # permissive_not_permissive_map = {
222
+ # np.nan: 0,
223
+ # "permissive": 1,
224
+ # "not_permissive": 2
225
+ # }
226
 
227
  permissions_columns = [
228
  "commercial-use",
 
251
  "patent-use"
252
  ]
253
 
254
+ # permissive_not_permissive_columns = [
255
+ # "GTLC_Permissive"
256
+ # ]
257
 
258
  license_data[permissions_columns] = license_data[permissions_columns].replace(permissions_map)
259
  license_data[conditions_columns] = license_data[conditions_columns].replace(conditions_map)
260
  license_data[limitations_columns] = license_data[limitations_columns].replace(limitations_map)
261
  license_data[permissions_limitations_columns] = license_data[permissions_limitations_columns].replace(permissions_limitations_map)
262
+ # license_data[permissive_not_permissive_columns] = license_data[permissive_not_permissive_columns].replace(permissive_not_permissive_map)
263
 
264
  return license_data
src/textrank.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import numpy as np
3
+ import gensim
4
+ import spacy
5
+ import math
6
+ from collections import Counter
7
+
8
+ from src.clean import clean_license_text
9
+ from src.read_data import read_file
10
+
11
+ nltk.download('punkt')
12
+
13
+ properties_dict = {
14
+ "modify":['modify', 'modification', 'change'],
15
+ "distribute":['distribute', 'distribution'],
16
+ "copy":['copy'],
17
+ "copyright": ['copyright']
18
+ # "exception"
19
+ }
20
+
21
+ properties_scores = {
22
+ "modify": 0.8,
23
+ "distribute": 0.8,
24
+ "copy": 0.8,
25
+ "copyright": 0.9
26
+ }
27
+
28
+ nlp = spacy.load('en_core_web_sm')
29
+
30
def lemmatize_tokens(sent):
    """Return the lowercased lemma of every token in *sent*.

    Tokenization/lemmatization is done by the module-level spaCy
    pipeline (``nlp``).
    """
    doc = nlp(sent)
    return [token.lemma_.lower() for token in doc]
34
+
35
+
36
def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, debug=False):
    """Extractive license summarizer scoring sentences by property keywords.

    Each sentence is scored by how many property-related lemmas
    (modify/distribute/copy/copyright synonyms from ``properties_dict``)
    it contains, weighted by ``properties_scores`` and normalized by
    sentence length; the top-scoring fraction of sentences is returned.

    Parameters
    ----------
    license_text : str
        Raw license text.
    min_sent_len : int
        Sentences with fewer words than this are skipped.
    summary_len : float
        Fraction of the original sentence count to keep in the summary.
    debug : bool
        When True, print per-sentence scoring details.

    Returns
    -------
    tuple[str, str]
        ``(summary, definitions)``; definitions come from cleaning.
    """
    n_summary_sents = math.ceil(summary_len * len(license_text.split(".")))
    sent_scores = {}
    cleaned_license_text, definitions = clean_license_text(license_text)

    for sent in cleaned_license_text.split("."):
        if debug:
            print(sent.split())
        if len(sent.split()) < min_sent_len:
            # Skip short fragments (e.g. headings) instead of `break`,
            # which aborted scoring for the rest of the document.
            continue

        # Lemmatization does not depend on the property being scored,
        # so do it once per sentence, not once per property.
        lemmatized_tokens = lemmatize_tokens(sent)
        word_count = Counter(lemmatized_tokens)

        score = 0
        for prop, prop_words in properties_dict.items():
            prop_score = 0
            for prop_word in prop_words:
                if prop_word in word_count:
                    # The score table is keyed by property name, not by
                    # the matched synonym; `properties_scores[prop_word]`
                    # raised KeyError for e.g. "modification"/"change".
                    prop_score += properties_scores[prop]
            if debug:
                print(prop, "=", prop_score)
            score += prop_score

        # Normalize by length so long sentences don't dominate.
        sent_scores[sent] = score / len(lemmatized_tokens)
        if debug:
            print(f"Sentence score: {sent_scores[sent]}")
            print()

    if debug:
        print(sent_scores)

    sorted_sent_scores = dict(
        sorted(sent_scores.items(), key=lambda item: item[1], reverse=True)
    )
    summary = ".\n".join(list(sorted_sent_scores.keys())[:n_summary_sents])
    return summary, definitions
68
+
69
+