Add current system
Files changed:
- app.py +57 -0
- corpora/__init__.py +1 -0
- corpora/__pycache__/__init__.cpython-39.pyc +0 -0
- corpora/__pycache__/corpora.cpython-39.pyc +0 -0
- corpora/__pycache__/pira.cpython-39.pyc +0 -0
- corpora/__pycache__/sourcer.cpython-39.pyc +0 -0
- corpora/corpora.py +21 -0
- corpora/pira.py +7 -0
- corpora/sourcer.py +34 -0
- data/README.md +1 -0
- data/datasets/pira_simplified.csv +0 -0
- data/datasets/pira_simplified.csv:Zone.Identifier +3 -0
- extractor/__init__.py +2 -0
- extractor/__pycache__/__init__.cpython-39.pyc +0 -0
- extractor/__pycache__/_utils.cpython-39.pyc +0 -0
- extractor/__pycache__/extract.cpython-39.pyc +0 -0
- extractor/_utils.py +104 -0
- extractor/extract.py +70 -0
- requirements.txt +140 -0
- summarizer/__init__.py +1 -0
- summarizer/__pycache__/__init__.cpython-39.pyc +0 -0
- summarizer/__pycache__/_utils.cpython-39.pyc +0 -0
- summarizer/__pycache__/summarize.cpython-39.pyc +0 -0
- summarizer/_utils.py +0 -0
- summarizer/summarize.py +33 -0
app.py
ADDED
@@ -0,0 +1,57 @@
import streamlit as st
from extractor import extract, FewDocumentsError
from summarizer import summarize
import time

# TODO: translation

st.title("Trabalho de Formatura - Construindo textos para a internet")
st.subheader("Lucas Antunes e Matheus Vieira")

st.subheader("Digite o tópico sobre o qual você deseja gerar um resumo")
query = st.text_input('Digite o tópico em inglês')  # text is stored in this variable

if 'few_documents' not in st.session_state:
    st.session_state['few_documents'] = False
    few_documents = False
else:
    few_documents = st.session_state['few_documents']

button1 = st.button('Gerar resumo')

if button1:
    start_time = time.time()
    try:
        with st.spinner('Extraindo textos relevantes...'):
            text = extract(query)
    except FewDocumentsError as e:
        few_documents = True
        st.session_state['few_documents'] = True
        st.session_state['documents'] = e.documents
        st.session_state['msg'] = e.msg
    else:

        st.info(f'(Extraction) Elapsed time: {time.time() - start_time:.2f}s')
        with st.spinner('Gerando resumo...'):
            summary = summarize(text)
        st.info(f'(Total) Elapsed time: {time.time() - start_time:.2f}s')

        st.markdown(f'Seu resumo para "{query}":\n\n> {summary}')


if few_documents:
    st.warning(st.session_state['msg'])
    if st.button('Prosseguir'):
        start_time = time.time()
        with st.spinner('Extraindo textos relevantes...'):
            text = extract(query, extracted_documents=st.session_state['documents'])
        st.info(f'(Extraction) Elapsed time: {time.time() - start_time:.2f}s')
        with st.spinner('Gerando resumo...'):
            summary = summarize(text)
        st.info(f'(Total) Elapsed time: {time.time() - start_time:.2f}s')

        st.markdown(f'Seu resumo para "{query}":\n\n> {summary}')

        st.session_state['few_documents'] = False
        few_documents = False

corpora/__init__.py
ADDED
@@ -0,0 +1 @@
from .corpora import gen_corpus
corpora/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (176 Bytes).
corpora/__pycache__/corpora.cpython-39.pyc
ADDED
Binary file (613 Bytes).
corpora/__pycache__/pira.cpython-39.pyc
ADDED
Binary file (304 Bytes).
corpora/__pycache__/sourcer.cpython-39.pyc
ADDED
Binary file (1.14 kB).
corpora/corpora.py
ADDED
@@ -0,0 +1,21 @@
from .sourcer import search_web
import pandas as pd
import os

root_dir = 'data/datasets'
pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))

def gen_corpus(query: str, pira: bool=True, ONU: bool=True, web: bool=True) -> list:
    corpus = []
    if not (pira or ONU or web):
        # TODO: raise error
        pass
    if pira:
        corpus += pira_df.text.to_list()
    if ONU:
        # TODO: implement PDFs
        pass
    if web:
        corpus += search_web(query)

    return corpus
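
For reference, a minimal usage sketch of gen_corpus (not part of this commit). It assumes the Pirá CSV is present under data/datasets and that the Google search credentials used by corpora/sourcer.py are configured; the example query is illustrative only.

# Usage sketch (not in the commit); needs data/datasets/pira_simplified.csv
# and GOOGLE_KEY / GOOGLE_ENGINE for the web source.
from corpora import gen_corpus

corpus = gen_corpus("ocean acidification", ONU=False)  # the PDF (ONU) source is still a TODO
print(len(corpus), "documents in corpus")
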
corpora/pira.py
ADDED
@@ -0,0 +1,7 @@
import pandas as pd
import os

# Open dataset
root_dir = 'data/datasets'
pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))
pira = pira_df.text.to_list()
corpora/sourcer.py
ADDED
@@ -0,0 +1,34 @@
import requests
from string import Template
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from os import getenv

load_dotenv()

google_key = getenv('GOOGLE_KEY')
google_engine = getenv('GOOGLE_ENGINE')

url = Template(f'https://www.googleapis.com/customsearch/v1?key={google_key}&cx={google_engine}&q=$query')

def search_web(query: str) -> list:
    query = '+'.join(query.split())
    results = requests.get(url.substitute(query=query)).json()['items']

    links = [item['link'] for item in results]

    texts = []
    for link in links:
        resp = requests.get(link)
        soup = BeautifulSoup(resp.text, 'html.parser')
        text = []
        # remove lists
        for tag in soup.find_all('li'):
            tag.extract()

        tags = soup.find_all('p')
        for tag in tags:
            text.append(tag.text)
        texts.append('\n'.join(text))

    return texts
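
A minimal sketch of calling search_web directly (not part of this commit). It assumes a .env file providing GOOGLE_KEY and GOOGLE_ENGINE (Google Programmable Search credentials) and network access; the query string is an example.

# Usage sketch (not in the commit).
from corpora.sourcer import search_web

pages = search_web("ocean acidification")
print(len(pages), "pages scraped")
print(pages[0][:200])  # preview of the first scraped page
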
data/README.md
ADDED
@@ -0,0 +1 @@
Store data here.
data/datasets/pira_simplified.csv
ADDED
The diff for this file is too large to render.
data/datasets/pira_simplified.csv:Zone.Identifier
ADDED
@@ -0,0 +1,3 @@
[ZoneTransfer]
ZoneId=3
HostUrl=https://doc-0s-14-docs.googleusercontent.com/docs/securesc/l4sk7borm3s5kl6jhblek6c66jbnuvpo/kcpqkljqgl6hfdmmo6pq7s9l9msu1pqg/1633831125000/13208144011500786805/13208144011500786805/1-3OY2MTvTqLoOtkD2-iQrp9lbRMfEReh?e=download&authuser=2&nonce=r5s0v4hsue126&user=13208144011500786805&hash=v8qg2imo0qo63hdo4j6qker2n7b7ijto
extractor/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .extract import extract
from ._utils import FewDocumentsError
extractor/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (214 Bytes).
extractor/__pycache__/_utils.cpython-39.pyc
ADDED
Binary file (4.82 kB).
extractor/__pycache__/extract.cpython-39.pyc
ADDED
Binary file (2.13 kB).
extractor/_utils.py
ADDED
@@ -0,0 +1,104 @@
import nmslib
import numpy as np
import streamlit as st
import inflect
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

p = inflect.engine()

class FewDocumentsError(Exception):
    def __init__(self, documents, size, msg):
        self.documents = documents
        self.size = size
        self.msg = msg

    def __str__(self):
        return repr(self.msg)

def document_extraction(dataset, query, keywords, min_document_size, min_just_one_paragraph_size):
    word_in_text = lambda word, text: any([p.compare(word, w) for w in text.split()])
    lower_dataset = [document.lower() for document in dataset]
    lower_query = query.lower()
    lower_keywords = [keyword.lower() for keyword in keywords]

    documents = {}

    documents['QUERY'] = [
        dataset[lower_dataset.index(document)] for document in lower_dataset
        if (word_in_text(lower_query, document))
        and (len(document.split()) > min_document_size)
        and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
    ]

    documents['AND'] = [
        dataset[lower_dataset.index(document)] for document in lower_dataset
        if all(word_in_text(keyword, document) for keyword in lower_keywords)
        and (len(document.split()) > min_document_size)
        and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
    ]

    documents['OR'] = [
        dataset[lower_dataset.index(document)] for document in lower_dataset
        if any(word_in_text(keyword, document) for keyword in lower_keywords)
        and (len(document.split()) > min_document_size)
        and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
    ]

    empty = {
        'QUERY': len(documents['QUERY']) == 0,
        'AND': len(documents['AND']) == 0,
        'OR': len(documents['OR']) == 0
    }

    sizes = {
        'QUERY': len(documents['QUERY']),
        'AND': len(documents['AND']),
        'OR': len(documents['OR'])
    }

    if all(empty.values()):
        # TODO: throw error
        st.info(empty.values())
        st.warning(f'No document found for the query "{query}", please try with another query')
        st.stop()

    if sizes['QUERY'] >= 10:
        extracted_documents = documents['QUERY']
    elif sizes['AND'] >= 10:
        extracted_documents = documents['AND']
    elif sizes['OR'] >= 10:
        extracted_documents = documents['OR']
    else:
        number_of_documents = sizes['OR']
        raise FewDocumentsError(documents['OR'], number_of_documents,
            f'Only {number_of_documents} documents found for the query "{query}"\n\
            Please select continue to proceed with {number_of_documents} documents or try again with another query'
        )

    return extracted_documents, empty, sizes

def paragraph_extraction(documents, min_paragraph_size):
    paragraphs = [
        documents[i].splitlines()[j] for i in range(len(documents)) for j in range(len(documents[i].splitlines()))
        if (len(documents[i].splitlines()[j].split()) > min_paragraph_size)
    ]

    return paragraphs

def semantic_search(model, query, files, number_of_similar_files):
    encoded_query = model.encode(query, device=device)
    encoded_files = model.encode(files, device=device)

    model_index = nmslib.init(method='hnsw', space='angulardist')
    model_index.addDataPointBatch(encoded_files)
    model_index.createIndex({'post': 2})

    ids, distances = model_index.knnQuery(encoded_query, k=number_of_similar_files)

    selected_files = [files[index] for index in ids]

    distances = 180 * distances / np.pi

    return selected_files, distances
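
A minimal sketch of semantic_search in isolation (not part of this commit). The toy documents are invented for illustration; the msmarco-distilbert-base-v4 model name is taken from extract.py below.

# Usage sketch (not in the commit).
from sentence_transformers import SentenceTransformer
from extractor._utils import semantic_search

model = SentenceTransformer('msmarco-distilbert-base-v4')
docs = [
    "The ocean absorbs about a third of emitted carbon dioxide.",
    "Cats are popular household pets.",
    "Coral reefs are highly sensitive to changes in water acidity.",
]
selected, distances = semantic_search(model, "ocean acidification", docs, number_of_similar_files=2)
print(selected)    # the two most similar documents
print(distances)   # angular distances, converted to degrees
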
extractor/extract.py
ADDED
@@ -0,0 +1,70 @@
from sentence_transformers import SentenceTransformer
from ._utils import FewDocumentsError
from ._utils import document_extraction, paragraph_extraction, semantic_search
from corpora import gen_corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

from ._utils import device

def extract(query: str, n: int=3, extracted_documents: list=None) -> str:
    """Extract n paragraphs from the corpus using the given query.

    Parameters:
    query (str): Sentence used to search the corpus for relevant documents
    n (int): Number of paragraphs to return

    Returns:
    str: String containing the n most relevant paragraphs joined by line breaks
    """
    # Open corpus
    corpus = gen_corpus(query)

    # Setup query
    stop_words = set(stopwords.words('english'))
    query_tokens = word_tokenize(query.lower())
    tokens_without_sw = [word for word in query_tokens if not word in stop_words]
    keywords = [keyword for keyword in tokens_without_sw if keyword not in string.punctuation]

    # Gross search
    if not extracted_documents:
        extracted_documents, documents_empty, documents_sizes = document_extraction(
            dataset=corpus,
            query=query,
            keywords=keywords,
            min_document_size=0,
            min_just_one_paragraph_size=0
        )

    # First semantic search (over documents)
    # Model for semantic searches
    search_model = SentenceTransformer('msmarco-distilbert-base-v4', device=device)
    selected_documents, documents_distances = semantic_search(
        model=search_model,
        query=query,
        files=extracted_documents,
        number_of_similar_files=10
    )

    # Second semantic search (over paragraphs)
    paragraphs = paragraph_extraction(
        documents=selected_documents,
        min_paragraph_size=20,
    )

    # Model for the second semantic search
    selected_paragraphs, paragraphs_distances = semantic_search(
        model=search_model,
        query=query,
        files=paragraphs,
        number_of_similar_files=10
    )

    from pprint import pprint
    pprint(selected_paragraphs[:n])

    text = '\n'.join(selected_paragraphs[:n])

    return text

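
A sketch of how extract() is consumed (not part of this commit); it mirrors the flow in app.py, including the FewDocumentsError fallback. The query string is an example.

# Usage sketch (not in the commit), mirroring app.py.
from extractor import extract, FewDocumentsError

try:
    text = extract("ocean acidification", n=3)
except FewDocumentsError as e:
    # Fewer than 10 documents matched; proceed with whatever was found.
    text = extract("ocean acidification", extracted_documents=e.documents)
print(text)
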
requirements.txt
ADDED
@@ -0,0 +1,140 @@
altair==4.1.0
argh @ file:///home/conda/feedstock_root/build_artifacts/argh_1595627874344/work
argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1633990448879/work
astor @ file:///home/conda/feedstock_root/build_artifacts/astor_1593610464257/work
async-generator==1.10
attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1620387926260/work
backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
backports.functools-lru-cache @ file:///home/conda/feedstock_root/build_artifacts/backports.functools_lru_cache_1618230623929/work
base58 @ file:///home/conda/feedstock_root/build_artifacts/base58_1610222858736/work
beautifulsoup4 @ file:///home/conda/feedstock_root/build_artifacts/beautifulsoup4_1631087867185/work
bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1629908509068/work
blinker==1.4
blis @ file:///home/conda/feedstock_root/build_artifacts/cython-blis_1607338152218/work
boto3 @ file:///home/conda/feedstock_root/build_artifacts/boto3_1633765668486/work
botocore @ file:///home/conda/feedstock_root/build_artifacts/botocore_1633987764519/work
brotlipy==0.7.0
cachetools @ file:///home/conda/feedstock_root/build_artifacts/cachetools_1633010882559/work
catalogue @ file:///home/conda/feedstock_root/build_artifacts/catalogue_1632148432306/work
certifi==2021.10.8
cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1631636250774/work
charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1632423846401/work
click==7.1.2
colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1602866480661/work
cryptography @ file:///tmp/build/80754af9/cryptography_1633520369886/work
cupy @ file:///home/conda/feedstock_root/build_artifacts/cupy_1633141077366/work
cymem @ file:///home/conda/feedstock_root/build_artifacts/cymem_1625012218510/work
dataclasses @ file:///home/conda/feedstock_root/build_artifacts/dataclasses_1628958434797/work
debugpy @ file:///home/conda/feedstock_root/build_artifacts/debugpy_1627074853231/work
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1631346842025/work
defusedxml @ file:///home/conda/feedstock_root/build_artifacts/defusedxml_1615232257335/work
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl
entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1605121927639/work/dist/entrypoints-0.3-py2.py3-none-any.whl
fastrlock==0.6
filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1633273934192/work
gitdb @ file:///tmp/build/80754af9/gitdb_1617117951232/work
GitPython @ file:///home/conda/feedstock_root/build_artifacts/gitpython_1632423794953/work
huggingface-hub @ file:///home/conda/feedstock_root/build_artifacts/huggingface_hub_1633615623676/work
idna @ file:///tmp/build/80754af9/idna_1622654382723/work
importlib-metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1630267473458/work
inflect @ file:///home/conda/feedstock_root/build_artifacts/inflect_1635835916074/work
ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1631291098355/work/dist/ipykernel-6.4.1-py3-none-any.whl
ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1632763773116/work
ipython-genutils==0.2.0
ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1631590360471/work
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1610146787869/work
Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1633656206378/work
jmespath @ file:///home/conda/feedstock_root/build_artifacts/jmespath_1589369830981/work
joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1633637554808/work
jsonschema @ file:///home/conda/feedstock_root/build_artifacts/jsonschema_1633875207482/work
jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1633454794268/work
jupyter-core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1631852705892/work
jupyterlab-pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1601375948261/work
jupyterlab-widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1631590465624/work
MarkupSafe @ file:///home/conda/feedstock_root/build_artifacts/markupsafe_1621455668600/work
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1631080358261/work
mistune @ file:///home/conda/feedstock_root/build_artifacts/mistune_1624941317779/work
mkl-fft==1.3.0
mkl-random==1.2.2
mkl-service==2.4.0
murmurhash @ file:///home/conda/feedstock_root/build_artifacts/murmurhash_1607334222000/work
nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1629120697898/work
nbconvert @ file:///home/conda/feedstock_root/build_artifacts/nbconvert_1632535927841/work
nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1617383142101/work
nest-asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1617163391303/work
nltk @ file:///home/conda/feedstock_root/build_artifacts/nltk_1633093058893/work
nmslib==2.0.16
notebook @ file:///home/conda/feedstock_root/build_artifacts/notebook_1631733685426/work
numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1629092040774/work
olefile @ file:///home/conda/feedstock_root/build_artifacts/olefile_1602866521163/work
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1625323647219/work
pandas==1.2.5
pandocfilters @ file:///home/conda/feedstock_root/build_artifacts/pandocfilters_1631603243851/work
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1617148930513/work
pathy @ file:///home/conda/feedstock_root/build_artifacts/pathy_1624897245984/work
pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1602535608087/work
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
Pillow @ file:///tmp/build/80754af9/pillow_1625655408865/work
preshed @ file:///home/conda/feedstock_root/build_artifacts/preshed_1625048866397/work
prometheus-client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1622586138406/work
prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1629903925368/work
protobuf==3.17.2
ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
pyarrow==3.0.0
pybind11 @ file:///home/conda/feedstock_root/build_artifacts/pybind11-split_1633408728392/work
pybind11-global @ file:///home/conda/feedstock_root/build_artifacts/pybind11-split_1633408728392/work
pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1593275161868/work
pydantic @ file:///home/conda/feedstock_root/build_artifacts/pydantic_1620819959703/work
pydeck @ file:///home/conda/feedstock_root/build_artifacts/pydeck_1630093567483/work
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1629119114968/work
pyOpenSSL @ file:///home/conda/feedstock_root/build_artifacts/pyopenssl_1633192417276/work
pyparsing==2.4.7
pyrsistent @ file:///tmp/build/80754af9/pyrsistent_1625052304482/work
PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1610291451001/work
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1626286286081/work
python-dotenv @ file:///home/conda/feedstock_root/build_artifacts/python-dotenv_1633777049699/work
pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1633452062248/work
PyYAML==5.4.1
pyzmq @ file:///home/conda/feedstock_root/build_artifacts/pyzmq_1631793304627/work
regex @ file:///home/conda/feedstock_root/build_artifacts/regex_1633750069182/work
requests @ file:///tmp/build/80754af9/requests_1629994808627/work
s3transfer @ file:///home/conda/feedstock_root/build_artifacts/s3transfer_1626384238958/work
sacremoses @ file:///home/conda/feedstock_root/build_artifacts/sacremoses_1588857588528/work
scikit-learn @ file:///tmp/build/80754af9/scikit-learn_1621370406642/work
scipy @ file:///tmp/build/80754af9/scipy_1630606796912/work
Send2Trash @ file:///home/conda/feedstock_root/build_artifacts/send2trash_1628511208346/work
sentence-transformers @ file:///home/conda/feedstock_root/build_artifacts/sentence-transformers_1626294390713/work
shellingham @ file:///home/conda/feedstock_root/build_artifacts/shellingham_1612179560728/work
six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
smart-open @ file:///home/conda/feedstock_root/build_artifacts/smart_open_1630238320325/work
smmap @ file:///home/conda/feedstock_root/build_artifacts/smmap_1612632339838/work
soupsieve @ file:///tmp/build/80754af9/soupsieve_1616183228191/work
spacy @ file:///home/conda/feedstock_root/build_artifacts/spacy_1632244522110/work
spacy-legacy @ file:///home/conda/feedstock_root/build_artifacts/spacy-legacy_1625687473390/work
srsly @ file:///home/conda/feedstock_root/build_artifacts/srsly_1618231647618/work
streamlit @ file:///home/conda/feedstock_root/build_artifacts/streamlit_1633453177811/work
terminado @ file:///home/conda/feedstock_root/build_artifacts/terminado_1631128166466/work
testpath @ file:///home/conda/feedstock_root/build_artifacts/testpath_1621261527237/work
thinc @ file:///home/conda/feedstock_root/build_artifacts/thinc_1632600737641/work
threadpoolctl @ file:///home/conda/feedstock_root/build_artifacts/threadpoolctl_1633102299089/work
tokenizers @ file:///home/conda/feedstock_root/build_artifacts/tokenizers_1632285667965/work
toml @ file:///home/conda/feedstock_root/build_artifacts/toml_1604308577558/work
toolz @ file:///home/conda/feedstock_root/build_artifacts/toolz_1600973991856/work
torch==1.9.1
torchaudio==0.9.0a0+a85b239
torchvision==0.10.1
tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1610094708661/work
tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1632160078689/work
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1630423529112/work
transformers @ file:///home/conda/feedstock_root/build_artifacts/transformers_1633051764196/work
typer @ file:///home/conda/feedstock_root/build_artifacts/typer_1630326630489/work
typing-extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1632313171031/work
tzlocal @ file:///home/conda/feedstock_root/build_artifacts/tzlocal_1629721600364/work
urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1632350318291/work
validators @ file:///home/conda/feedstock_root/build_artifacts/validators_1608296160673/work
wasabi @ file:///home/conda/feedstock_root/build_artifacts/wasabi_1612156086016/work
watchdog @ file:///home/conda/feedstock_root/build_artifacts/watchdog_1633154568778/work
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1600965781394/work
webencodings==0.5.1
widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1605475529901/work
zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1633302054558/work
summarizer/__init__.py
ADDED
@@ -0,0 +1 @@
from .summarize import summarize
summarizer/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (167 Bytes).
summarizer/__pycache__/_utils.cpython-39.pyc
ADDED
Binary file (115 Bytes).
summarizer/__pycache__/summarize.cpython-39.pyc
ADDED
Binary file (1.04 kB).
summarizer/_utils.py
ADDED
File without changes
summarizer/summarize.py
ADDED
@@ -0,0 +1,33 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def summarize(text: str) -> str:
    """
    Generate a summary from the given text
    """

    # Model for abstraction
    model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
    tokenizer = AutoTokenizer.from_pretrained('t5-base')

    input_tokens = tokenizer.encode(
        f'summarize: {text}',
        return_tensors='pt',
        max_length=tokenizer.model_max_length,
        truncation=True
    )

    summary_ids = model.generate(
        input_tokens,
        min_length=80,
        max_length=150,
        length_penalty=15,
        num_beams=2
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    summary = '. '.join([phrase.capitalize() for phrase in summary.split('. ')])
    if not summary[-1] == '.':
        summary = summary + '.'

    return summary
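
A minimal sketch of the summarizer on its own (not part of this commit). The example passage is invented for illustration; the t5-base checkpoint is downloaded on first use.

# Usage sketch (not in the commit).
from summarizer import summarize

passage = (
    "Ocean acidification is the ongoing decrease in the pH of the Earth's oceans, "
    "caused by the uptake of carbon dioxide from the atmosphere. It affects "
    "calcifying organisms such as corals and shellfish."
)
print(summarize(passage))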