mhsvieira committed on
Commit
e539b70
1 Parent(s): 515969f

Add current system

app.py ADDED
@@ -0,0 +1,57 @@
+ import streamlit as st
+ from extractor import extract, FewDocumentsError
+ from summarizer import summarize
+ import time
+
+ # TODO: translation
+
+ st.title("Trabalho de Formatura - Construindo textos para a internet")
+ st.subheader("Lucas Antunes e Matheus Vieira")
+
+ st.subheader("Digite o tópico sobre o qual você deseja gerar um resumo")
+ query = st.text_input('Digite o tópico em inglês') #text is stored in this variable
+
+ if 'few_documents' not in st.session_state:
+     st.session_state['few_documents'] = False
+     few_documents = False
+ else:
+     few_documents = st.session_state['few_documents']
+
+ button1 = st.button('Gerar resumo')
+
+ if button1:
+     start_time = time.time()
+     try:
+         with st.spinner('Extraindo textos relevantes...'):
+             text = extract(query)
+     except FewDocumentsError as e:
+         few_documents = True
+         st.session_state['few_documents'] = True
+         st.session_state['documents'] = e.documents
+         st.session_state['msg'] = e.msg
+     else:
+
+         st.info(f'(Extraction) Elapsed time: {time.time() - start_time:.2f}s')
+         with st.spinner('Gerando resumo...'):
+             summary = summarize(text)
+         st.info(f'(Total) Elapsed time: {time.time() - start_time:.2f}s')
+
+         st.markdown(f'Seu resumo para "{query}":\n\n> {summary}')
+
+
+ if few_documents:
+     st.warning(st.session_state['msg'])
+     if st.button('Prosseguir'):
+         start_time = time.time()
+         with st.spinner('Extraindo textos relevantes...'):
+             text = extract(query, extracted_documents=st.session_state['documents'])
+         st.info(f'(Extraction) Elapsed time: {time.time() - start_time:.2f}s')
+         with st.spinner('Gerando resumo...'):
+             summary = summarize(text)
+         st.info(f'(Total) Elapsed time: {time.time() - start_time:.2f}s')
+
+         st.markdown(f'Seu resumo para "{query}":\n\n> {summary}')
+
+         st.session_state['few_documents'] = False
+         few_documents = False
+
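A likely way to run the app locally, assuming the dependencies listed in requirements.txt below are installed (the command itself is not part of the commit):

    streamlit run app.py
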
corpora/__init__.py ADDED
@@ -0,0 +1 @@
+ from .corpora import gen_corpus
corpora/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (176 Bytes).

corpora/__pycache__/corpora.cpython-39.pyc ADDED
Binary file (613 Bytes).

corpora/__pycache__/pira.cpython-39.pyc ADDED
Binary file (304 Bytes).

corpora/__pycache__/sourcer.cpython-39.pyc ADDED
Binary file (1.14 kB).
 
corpora/corpora.py ADDED
@@ -0,0 +1,21 @@
+ from .sourcer import search_web
+ import pandas as pd
+ import os
+
+ root_dir = 'data/datasets'
+ pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))
+
+ def gen_corpus(query: str, pira: bool=True, ONU: bool=True, web: bool=True)->list:
+     corpus = []
+     if not (pira or ONU or web):
+         # TODO: raise error
+         pass
+     if pira:
+         corpus += pira_df.text.to_list()
+     if ONU:
+         # TODO: implement PDFs
+         pass
+     if web:
+         corpus += search_web(query)
+
+     return corpus
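A minimal usage sketch for gen_corpus (not part of the commit), assuming the package is imported from the repository root and data/datasets/pira_simplified.csv is present; the query string is illustrative:

    from corpora import gen_corpus

    # Combine the Pirá dataset with live web results for one query;
    # the ONU branch is skipped since it is still a TODO above.
    corpus = gen_corpus("ocean acidification", pira=True, ONU=False, web=True)
    print(f"{len(corpus)} documents in the corpus")
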
corpora/pira.py ADDED
@@ -0,0 +1,7 @@
+ import pandas as pd
+ import os
+
+ # Open dataset
+ root_dir = 'data/datasets'
+ pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))
+ pira = pira_df.text.to_list()
corpora/sourcer.py ADDED
@@ -0,0 +1,34 @@
+ import requests
+ from string import Template
+ from bs4 import BeautifulSoup
+ from dotenv import load_dotenv
+ from os import getenv
+
+ load_dotenv()
+
+ google_key = getenv('GOOGLE_KEY')
+ google_engine = getenv('GOOGLE_ENGINE')
+
+ url = Template(f'https://www.googleapis.com/customsearch/v1?key={google_key}&cx={google_engine}&q=$query')
+
+ def search_web(query: str) -> list:
+     query = '+'.join(query.split())
+     results = requests.get(url.substitute(query=query)).json()['items']
+
+     links = [item['link'] for item in results]
+
+     texts = []
+     for link in links:
+         resp = requests.get(link)
+         soup = BeautifulSoup(resp.text, 'html.parser')
+         text = []
+         # remove lists
+         for tag in soup.find_all('li'):
+             tag.extract()
+
+         tags = soup.find_all('p')
+         for tag in tags:
+             text.append(tag.text)
+         texts.append('\n'.join(text))
+
+     return texts
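A usage sketch for search_web (not part of the commit), assuming a .env file at the repository root defines GOOGLE_KEY and GOOGLE_ENGINE for the Google Custom Search JSON API; the query is illustrative:

    from corpora.sourcer import search_web

    # Fetch the result pages for one query and keep only their <p> text.
    pages = search_web("coral reef bleaching")
    print(len(pages), "pages scraped")
    print(pages[0][:200])  # first 200 characters of the first page's text
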
data/README.md ADDED
@@ -0,0 +1 @@
+ Store data here.
data/datasets/pira_simplified.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/datasets/pira_simplified.csv:Zone.Identifier ADDED
@@ -0,0 +1,3 @@
+ [ZoneTransfer]
+ ZoneId=3
+ HostUrl=https://doc-0s-14-docs.googleusercontent.com/docs/securesc/l4sk7borm3s5kl6jhblek6c66jbnuvpo/kcpqkljqgl6hfdmmo6pq7s9l9msu1pqg/1633831125000/13208144011500786805/13208144011500786805/1-3OY2MTvTqLoOtkD2-iQrp9lbRMfEReh?e=download&authuser=2&nonce=r5s0v4hsue126&user=13208144011500786805&hash=v8qg2imo0qo63hdo4j6qker2n7b7ijto
extractor/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .extract import extract
+ from ._utils import FewDocumentsError
extractor/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (214 Bytes).

extractor/__pycache__/_utils.cpython-39.pyc ADDED
Binary file (4.82 kB).

extractor/__pycache__/extract.cpython-39.pyc ADDED
Binary file (2.13 kB).
 
extractor/_utils.py ADDED
@@ -0,0 +1,104 @@
+ import nmslib
+ import numpy as np
+ import streamlit as st
+ import inflect
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ p = inflect.engine()
+
+ class FewDocumentsError(Exception):
+     def __init__(self, documents, size, msg):
+         self.documents = documents
+         self.size = size
+         self.msg = msg
+
+     def __str__(self):
+         return repr(self.msg)
+
+ def document_extraction(dataset, query, keywords, min_document_size, min_just_one_paragraph_size):
+     word_in_text = lambda word, text: any([p.compare(word, w) for w in text.split()])
+     lower_dataset = [document.lower() for document in dataset]
+     lower_query = query.lower()
+     lower_keywords = [keyword.lower() for keyword in keywords]
+
+     documents = {}
+
+     documents['QUERY'] = [
+         dataset[lower_dataset.index(document)] for document in lower_dataset
+         if (word_in_text(lower_query, document))
+         and (len(document.split()) > min_document_size)
+         and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
+     ]
+
+     documents['AND'] = [
+         dataset[lower_dataset.index(document)] for document in lower_dataset
+         if all(word_in_text(keyword, document) for keyword in lower_keywords)
+         and (len(document.split()) > min_document_size)
+         and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
+     ]
+
+     documents['OR'] = [
+         dataset[lower_dataset.index(document)] for document in lower_dataset
+         if any(word_in_text(keyword, document) for keyword in lower_keywords)
+         and (len(document.split()) > min_document_size)
+         and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
+     ]
+
+     empty = {
+         'QUERY': len(documents['QUERY']) == 0,
+         'AND': len(documents['AND']) == 0,
+         'OR': len(documents['OR']) == 0
+     }
+
+     sizes = {
+         'QUERY': len(documents['QUERY']),
+         'AND': len(documents['AND']),
+         'OR': len(documents['OR'])
+     }
+
+     if all(empty.values()):
+         # TODO: throw error
+         st.info(empty.values())
+         st.warning(f'No document found for the query "{query}", please try with another query')
+         st.stop()
+
+     if sizes['QUERY'] >= 10:
+         extracted_documents = documents['QUERY']
+     elif sizes['AND'] >= 10:
+         extracted_documents = documents['AND']
+     elif sizes['OR'] >= 10:
+         extracted_documents = documents['OR']
+     else:
+         number_of_documents = sizes['OR']
+         raise FewDocumentsError(documents['OR'], number_of_documents,
+             f'Only {number_of_documents} documents found for the query "{query}"\n\
+             Please select continue to proceed with {number_of_documents} documents or try again with another query'
+         )
+
+     return extracted_documents, empty, sizes
+
+ def paragraph_extraction(documents, min_paragraph_size):
+     paragraphs = [
+         documents[i].splitlines()[j] for i in range(len(documents)) for j in range(len(documents[i].splitlines()))
+         if (len(documents[i].splitlines()[j].split()) > min_paragraph_size)
+     ]
+
+     return paragraphs
+
+ def semantic_search(model, query, files, number_of_similar_files):
+     encoded_query = model.encode(query, device=device)
+     encoded_files = model.encode(files, device=device)
+
+     model_index = nmslib.init(method='hnsw', space='angulardist')
+     model_index.addDataPointBatch(encoded_files)
+     model_index.createIndex({'post': 2})
+
+     ids, distances = model_index.knnQuery(encoded_query, k=number_of_similar_files)
+
+     selected_files = [files[index] for index in ids]
+
+     distances = 180*distances/np.pi
+
+     return selected_files, distances
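A standalone sketch of semantic_search (not part of the commit), assuming sentence-transformers and nmslib are installed; the documents and query are illustrative:

    from sentence_transformers import SentenceTransformer
    from extractor._utils import semantic_search

    model = SentenceTransformer('msmarco-distilbert-base-v4')
    docs = [
        "The ocean absorbs a large share of emitted CO2.",
        "Coral reefs host much of the planet's marine biodiversity.",
        "Deep-sea mining raises environmental concerns.",
    ]
    # Returns the k nearest documents and their angular distances in degrees.
    selected, distances = semantic_search(model, "carbon uptake by the ocean", docs, number_of_similar_files=2)
    for doc, dist in zip(selected, distances):
        print(f"{dist:.1f} deg  {doc}")
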
extractor/extract.py ADDED
@@ -0,0 +1,70 @@
+ from sentence_transformers import SentenceTransformer
+ from ._utils import FewDocumentsError
+ from ._utils import document_extraction, paragraph_extraction, semantic_search
+ from corpora import gen_corpus
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ import string
+
+ from ._utils import device
+
+ def extract(query: str, n: int=3, extracted_documents: list=None) -> str:
+     """Extract n paragraphs from the corpus using the given query.
+
+     Parameters:
+     query (str): Sentence used to search the corpus for relevant documents
+     n (int): Number of paragraphs to return
+
+     Returns:
+     str: String containing the n most relevant paragraphs joined by line breaks
+     """
+     # Open corpus
+     corpus = gen_corpus(query)
+
+     # Setup query
+     stop_words = set(stopwords.words('english'))
+     query_tokens = word_tokenize(query.lower())
+     tokens_without_sw = [word for word in query_tokens if not word in stop_words]
+     keywords = [keyword for keyword in tokens_without_sw if keyword not in string.punctuation]
+
+     # Gross search
+     if not extracted_documents:
+         extracted_documents, documents_empty, documents_sizes = document_extraction(
+             dataset=corpus,
+             query=query,
+             keywords=keywords,
+             min_document_size=0,
+             min_just_one_paragraph_size=0
+         )
+
+     # First semantic search (over documents)
+     # Model for semantic searches
+     search_model = SentenceTransformer('msmarco-distilbert-base-v4', device=device)
+     selected_documents, documents_distances = semantic_search(
+         model=search_model,
+         query=query,
+         files=extracted_documents,
+         number_of_similar_files=10
+     )
+
+     # Second semantic search (over paragraphs)
+     paragraphs = paragraph_extraction(
+         documents=selected_documents,
+         min_paragraph_size=20,
+     )
+
+     # Model for the second semantic search
+     selected_paragraphs, paragraphs_distances = semantic_search(
+         model=search_model,
+         query=query,
+         files=paragraphs,
+         number_of_similar_files=10
+     )
+
+     from pprint import pprint
+     pprint(selected_paragraphs[:n])
+
+     text = '\n'.join(selected_paragraphs[:n])
+
+     return text
+
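A usage sketch for extract (not part of the commit), assuming the corpus sources above are available and the NLTK 'stopwords' and 'punkt' data have been downloaded; the query is illustrative:

    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')

    from extractor import extract

    # Returns the 3 paragraphs judged most relevant to the query, joined by newlines.
    text = extract("ocean acidification", n=3)
    print(text)
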
requirements.txt ADDED
@@ -0,0 +1,140 @@
+ altair==4.1.0
+ argh @ file:///home/conda/feedstock_root/build_artifacts/argh_1595627874344/work
+ argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1633990448879/work
+ astor @ file:///home/conda/feedstock_root/build_artifacts/astor_1593610464257/work
+ async-generator==1.10
+ attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1620387926260/work
+ backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
+ backports.functools-lru-cache @ file:///home/conda/feedstock_root/build_artifacts/backports.functools_lru_cache_1618230623929/work
+ base58 @ file:///home/conda/feedstock_root/build_artifacts/base58_1610222858736/work
+ beautifulsoup4 @ file:///home/conda/feedstock_root/build_artifacts/beautifulsoup4_1631087867185/work
+ bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1629908509068/work
+ blinker==1.4
+ blis @ file:///home/conda/feedstock_root/build_artifacts/cython-blis_1607338152218/work
+ boto3 @ file:///home/conda/feedstock_root/build_artifacts/boto3_1633765668486/work
+ botocore @ file:///home/conda/feedstock_root/build_artifacts/botocore_1633987764519/work
+ brotlipy==0.7.0
+ cachetools @ file:///home/conda/feedstock_root/build_artifacts/cachetools_1633010882559/work
+ catalogue @ file:///home/conda/feedstock_root/build_artifacts/catalogue_1632148432306/work
+ certifi==2021.10.8
+ cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1631636250774/work
+ charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1632423846401/work
+ click==7.1.2
+ colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1602866480661/work
+ cryptography @ file:///tmp/build/80754af9/cryptography_1633520369886/work
+ cupy @ file:///home/conda/feedstock_root/build_artifacts/cupy_1633141077366/work
+ cymem @ file:///home/conda/feedstock_root/build_artifacts/cymem_1625012218510/work
+ dataclasses @ file:///home/conda/feedstock_root/build_artifacts/dataclasses_1628958434797/work
+ debugpy @ file:///home/conda/feedstock_root/build_artifacts/debugpy_1627074853231/work
+ decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1631346842025/work
+ defusedxml @ file:///home/conda/feedstock_root/build_artifacts/defusedxml_1615232257335/work
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl
+ entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1605121927639/work/dist/entrypoints-0.3-py2.py3-none-any.whl
+ fastrlock==0.6
+ filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1633273934192/work
+ gitdb @ file:///tmp/build/80754af9/gitdb_1617117951232/work
+ GitPython @ file:///home/conda/feedstock_root/build_artifacts/gitpython_1632423794953/work
+ huggingface-hub @ file:///home/conda/feedstock_root/build_artifacts/huggingface_hub_1633615623676/work
+ idna @ file:///tmp/build/80754af9/idna_1622654382723/work
+ importlib-metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1630267473458/work
+ inflect @ file:///home/conda/feedstock_root/build_artifacts/inflect_1635835916074/work
+ ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1631291098355/work/dist/ipykernel-6.4.1-py3-none-any.whl
+ ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1632763773116/work
+ ipython-genutils==0.2.0
+ ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1631590360471/work
+ jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1610146787869/work
+ Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1633656206378/work
+ jmespath @ file:///home/conda/feedstock_root/build_artifacts/jmespath_1589369830981/work
+ joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1633637554808/work
+ jsonschema @ file:///home/conda/feedstock_root/build_artifacts/jsonschema_1633875207482/work
+ jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1633454794268/work
+ jupyter-core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1631852705892/work
+ jupyterlab-pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1601375948261/work
+ jupyterlab-widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1631590465624/work
+ MarkupSafe @ file:///home/conda/feedstock_root/build_artifacts/markupsafe_1621455668600/work
+ matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1631080358261/work
+ mistune @ file:///home/conda/feedstock_root/build_artifacts/mistune_1624941317779/work
+ mkl-fft==1.3.0
+ mkl-random==1.2.2
+ mkl-service==2.4.0
+ murmurhash @ file:///home/conda/feedstock_root/build_artifacts/murmurhash_1607334222000/work
+ nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1629120697898/work
+ nbconvert @ file:///home/conda/feedstock_root/build_artifacts/nbconvert_1632535927841/work
+ nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1617383142101/work
+ nest-asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1617163391303/work
+ nltk @ file:///home/conda/feedstock_root/build_artifacts/nltk_1633093058893/work
+ nmslib==2.0.16
+ notebook @ file:///home/conda/feedstock_root/build_artifacts/notebook_1631733685426/work
+ numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1629092040774/work
+ olefile @ file:///home/conda/feedstock_root/build_artifacts/olefile_1602866521163/work
+ packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1625323647219/work
+ pandas==1.2.5
+ pandocfilters @ file:///home/conda/feedstock_root/build_artifacts/pandocfilters_1631603243851/work
+ parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1617148930513/work
+ pathy @ file:///home/conda/feedstock_root/build_artifacts/pathy_1624897245984/work
+ pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1602535608087/work
+ pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
+ Pillow @ file:///tmp/build/80754af9/pillow_1625655408865/work
+ preshed @ file:///home/conda/feedstock_root/build_artifacts/preshed_1625048866397/work
+ prometheus-client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1622586138406/work
+ prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1629903925368/work
+ protobuf==3.17.2
+ ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
+ pyarrow==3.0.0
+ pybind11 @ file:///home/conda/feedstock_root/build_artifacts/pybind11-split_1633408728392/work
+ pybind11-global @ file:///home/conda/feedstock_root/build_artifacts/pybind11-split_1633408728392/work
+ pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1593275161868/work
+ pydantic @ file:///home/conda/feedstock_root/build_artifacts/pydantic_1620819959703/work
+ pydeck @ file:///home/conda/feedstock_root/build_artifacts/pydeck_1630093567483/work
+ Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1629119114968/work
+ pyOpenSSL @ file:///home/conda/feedstock_root/build_artifacts/pyopenssl_1633192417276/work
+ pyparsing==2.4.7
+ pyrsistent @ file:///tmp/build/80754af9/pyrsistent_1625052304482/work
+ PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1610291451001/work
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1626286286081/work
+ python-dotenv @ file:///home/conda/feedstock_root/build_artifacts/python-dotenv_1633777049699/work
+ pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1633452062248/work
+ PyYAML==5.4.1
+ pyzmq @ file:///home/conda/feedstock_root/build_artifacts/pyzmq_1631793304627/work
+ regex @ file:///home/conda/feedstock_root/build_artifacts/regex_1633750069182/work
+ requests @ file:///tmp/build/80754af9/requests_1629994808627/work
+ s3transfer @ file:///home/conda/feedstock_root/build_artifacts/s3transfer_1626384238958/work
+ sacremoses @ file:///home/conda/feedstock_root/build_artifacts/sacremoses_1588857588528/work
+ scikit-learn @ file:///tmp/build/80754af9/scikit-learn_1621370406642/work
+ scipy @ file:///tmp/build/80754af9/scipy_1630606796912/work
+ Send2Trash @ file:///home/conda/feedstock_root/build_artifacts/send2trash_1628511208346/work
+ sentence-transformers @ file:///home/conda/feedstock_root/build_artifacts/sentence-transformers_1626294390713/work
+ shellingham @ file:///home/conda/feedstock_root/build_artifacts/shellingham_1612179560728/work
+ six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
+ smart-open @ file:///home/conda/feedstock_root/build_artifacts/smart_open_1630238320325/work
+ smmap @ file:///home/conda/feedstock_root/build_artifacts/smmap_1612632339838/work
+ soupsieve @ file:///tmp/build/80754af9/soupsieve_1616183228191/work
+ spacy @ file:///home/conda/feedstock_root/build_artifacts/spacy_1632244522110/work
+ spacy-legacy @ file:///home/conda/feedstock_root/build_artifacts/spacy-legacy_1625687473390/work
+ srsly @ file:///home/conda/feedstock_root/build_artifacts/srsly_1618231647618/work
+ streamlit @ file:///home/conda/feedstock_root/build_artifacts/streamlit_1633453177811/work
+ terminado @ file:///home/conda/feedstock_root/build_artifacts/terminado_1631128166466/work
+ testpath @ file:///home/conda/feedstock_root/build_artifacts/testpath_1621261527237/work
+ thinc @ file:///home/conda/feedstock_root/build_artifacts/thinc_1632600737641/work
+ threadpoolctl @ file:///home/conda/feedstock_root/build_artifacts/threadpoolctl_1633102299089/work
+ tokenizers @ file:///home/conda/feedstock_root/build_artifacts/tokenizers_1632285667965/work
+ toml @ file:///home/conda/feedstock_root/build_artifacts/toml_1604308577558/work
+ toolz @ file:///home/conda/feedstock_root/build_artifacts/toolz_1600973991856/work
+ torch==1.9.1
+ torchaudio==0.9.0a0+a85b239
+ torchvision==0.10.1
+ tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1610094708661/work
+ tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1632160078689/work
+ traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1630423529112/work
+ transformers @ file:///home/conda/feedstock_root/build_artifacts/transformers_1633051764196/work
+ typer @ file:///home/conda/feedstock_root/build_artifacts/typer_1630326630489/work
+ typing-extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1632313171031/work
+ tzlocal @ file:///home/conda/feedstock_root/build_artifacts/tzlocal_1629721600364/work
+ urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1632350318291/work
+ validators @ file:///home/conda/feedstock_root/build_artifacts/validators_1608296160673/work
+ wasabi @ file:///home/conda/feedstock_root/build_artifacts/wasabi_1612156086016/work
+ watchdog @ file:///home/conda/feedstock_root/build_artifacts/watchdog_1633154568778/work
+ wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1600965781394/work
+ webencodings==0.5.1
+ widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1605475529901/work
+ zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1633302054558/work
summarizer/__init__.py ADDED
@@ -0,0 +1 @@
+ from .summarize import summarize
summarizer/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (167 Bytes).

summarizer/__pycache__/_utils.cpython-39.pyc ADDED
Binary file (115 Bytes).

summarizer/__pycache__/summarize.cpython-39.pyc ADDED
Binary file (1.04 kB).
 
summarizer/_utils.py ADDED
File without changes
summarizer/summarize.py ADDED
@@ -0,0 +1,33 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ def summarize(text: str) -> str:
+     """
+     Generate a summary based on the given text
+     """
+
+     # Model for abstraction
+     model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
+     tokenizer = AutoTokenizer.from_pretrained('t5-base')
+
+     input_tokens = tokenizer.encode(
+         f'summarize: {text}',
+         return_tensors='pt',
+         max_length=tokenizer.model_max_length,
+         truncation=True
+     )
+
+     summary_ids = model.generate(
+         input_tokens,
+         min_length=80,
+         max_length=150,
+         length_penalty=15,
+         num_beams=2
+     )
+
+     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+     summary = '. '.join([phrase.capitalize() for phrase in summary.split('. ')])
+     if not summary[-1] == '.':
+         summary = summary + '.'
+
+     return summary
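A usage sketch for summarize (not part of the commit); the first call downloads the t5-base checkpoint from the Hugging Face Hub, and the input below is only a placeholder for the paragraphs returned by extractor.extract:

    from summarizer import summarize

    text = (
        "Ocean acidification is the ongoing decrease in the pH of the oceans, "
        "caused by the uptake of carbon dioxide from the atmosphere. ..."
    )
    print(summarize(text))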