mhsvieira committed on
Commit
e539b70
1 Parent(s): 515969f

Add current system

app.py ADDED
@@ -0,0 +1,57 @@
+ import streamlit as st
+ from extractor import extract, FewDocumentsError
+ from summarizer import summarize
+ import time
+
+ # TODO: translation
+
+ st.title("Trabalho de Formatura - Construindo textos para a internet")
+ st.subheader("Lucas Antunes e Matheus Vieira")
+
+ st.subheader("Digite o tópico sobre o qual você deseja gerar um resumo")
+ query = st.text_input('Digite o tópico em inglês') #text is stored in this variable
+
+ if 'few_documents' not in st.session_state:
+     st.session_state['few_documents'] = False
+     few_documents = False
+ else:
+     few_documents = st.session_state['few_documents']
+
+ button1 = st.button('Gerar resumo')
+
+ if button1:
+     start_time = time.time()
+     try:
+         with st.spinner('Extraindo textos relevantes...'):
+             text = extract(query)
+     except FewDocumentsError as e:
+         few_documents = True
+         st.session_state['few_documents'] = True
+         st.session_state['documents'] = e.documents
+         st.session_state['msg'] = e.msg
+     else:
+
+         st.info(f'(Extraction) Elapsed time: {time.time() - start_time:.2f}s')
+         with st.spinner('Gerando resumo...'):
+             summary = summarize(text)
+         st.info(f'(Total) Elapsed time: {time.time() - start_time:.2f}s')
+
+         st.markdown(f'Seu resumo para "{query}":\n\n> {summary}')
+
+
+ if few_documents:
+     st.warning(st.session_state['msg'])
+     if st.button('Prosseguir'):
+         start_time = time.time()
+         with st.spinner('Extraindo textos relevantes...'):
+             text = extract(query, extracted_documents=st.session_state['documents'])
+         st.info(f'(Extraction) Elapsed time: {time.time() - start_time:.2f}s')
+         with st.spinner('Gerando resumo...'):
+             summary = summarize(text)
+         st.info(f'(Total) Elapsed time: {time.time() - start_time:.2f}s')
+
+         st.markdown(f'Seu resumo para "{query}":\n\n> {summary}')
+
+         st.session_state['few_documents'] = False
+         few_documents = False
+
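A likely way to run the app locally, assuming the dependencies listed in requirements.txt below are installed (the command itself is not part of the commit):

    streamlit run app.py
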
corpora/__init__.py ADDED
@@ -0,0 +1 @@
+ from .corpora import gen_corpus
corpora/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (176 Bytes).

corpora/__pycache__/corpora.cpython-39.pyc ADDED
Binary file (613 Bytes).

corpora/__pycache__/pira.cpython-39.pyc ADDED
Binary file (304 Bytes).

corpora/__pycache__/sourcer.cpython-39.pyc ADDED
Binary file (1.14 kB).
 
corpora/corpora.py ADDED
@@ -0,0 +1,21 @@
+ from .sourcer import search_web
+ import pandas as pd
+ import os
+
+ root_dir = 'data/datasets'
+ pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))
+
+ def gen_corpus(query: str, pira: bool=True, ONU: bool=True, web: bool=True)->list:
+     corpus = []
+     if not (pira or ONU or web):
+         # TODO: raise error
+         pass
+     if pira:
+         corpus += pira_df.text.to_list()
+     if ONU:
+         # TODO: implement PDFs
+         pass
+     if web:
+         corpus += search_web(query)
+
+     return corpus
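A minimal usage sketch for gen_corpus (not part of the commit), assuming the package is imported from the repository root and data/datasets/pira_simplified.csv is present; the query string is illustrative:

    from corpora import gen_corpus

    # Combine the Pirá dataset with live web results for one query;
    # the ONU branch is skipped since it is still a TODO above.
    corpus = gen_corpus("ocean acidification", pira=True, ONU=False, web=True)
    print(f"{len(corpus)} documents in the corpus")
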
corpora/pira.py ADDED
@@ -0,0 +1,7 @@
+ import pandas as pd
+ import os
+
+ # Open dataset
+ root_dir = 'data/datasets'
+ pira_df = pd.read_csv(os.path.join(root_dir, 'pira_simplified.csv'))
+ pira = pira_df.text.to_list()
corpora/sourcer.py ADDED
@@ -0,0 +1,34 @@
+ import requests
+ from string import Template
+ from bs4 import BeautifulSoup
+ from dotenv import load_dotenv
+ from os import getenv
+
+ load_dotenv()
+
+ google_key = getenv('GOOGLE_KEY')
+ google_engine = getenv('GOOGLE_ENGINE')
+
+ url = Template(f'https://www.googleapis.com/customsearch/v1?key={google_key}&cx={google_engine}&q=$query')
+
+ def search_web(query: str) -> list:
+     query = '+'.join(query.split())
+     results = requests.get(url.substitute(query=query)).json()['items']
+
+     links = [item['link'] for item in results]
+
+     texts = []
+     for link in links:
+         resp = requests.get(link)
+         soup = BeautifulSoup(resp.text, 'html.parser')
+         text = []
+         # remove lists
+         for tag in soup.find_all('li'):
+             tag.extract()
+
+         tags = soup.find_all('p')
+         for tag in tags:
+             text.append(tag.text)
+         texts.append('\n'.join(text))
+
+     return texts
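A usage sketch for search_web (not part of the commit), assuming a .env file at the repository root defines GOOGLE_KEY and GOOGLE_ENGINE for the Google Custom Search JSON API; the query is illustrative:

    from corpora.sourcer import search_web

    # Fetch the result pages for one query and keep only their <p> text.
    pages = search_web("coral reef bleaching")
    print(len(pages), "pages scraped")
    print(pages[0][:200])  # first 200 characters of the first page's text
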
data/README.md ADDED
@@ -0,0 +1 @@
+ Store data here.
data/datasets/pira_simplified.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/datasets/pira_simplified.csv:Zone.Identifier ADDED
@@ -0,0 +1,3 @@
+ [ZoneTransfer]
+ ZoneId=3
+ HostUrl=https://doc-0s-14-docs.googleusercontent.com/docs/securesc/l4sk7borm3s5kl6jhblek6c66jbnuvpo/kcpqkljqgl6hfdmmo6pq7s9l9msu1pqg/1633831125000/13208144011500786805/13208144011500786805/1-3OY2MTvTqLoOtkD2-iQrp9lbRMfEReh?e=download&authuser=2&nonce=r5s0v4hsue126&user=13208144011500786805&hash=v8qg2imo0qo63hdo4j6qker2n7b7ijto
extractor/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .extract import extract
+ from ._utils import FewDocumentsError
extractor/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (214 Bytes).

extractor/__pycache__/_utils.cpython-39.pyc ADDED
Binary file (4.82 kB).

extractor/__pycache__/extract.cpython-39.pyc ADDED
Binary file (2.13 kB).
 
extractor/_utils.py ADDED
@@ -0,0 +1,104 @@
+ import nmslib
+ import numpy as np
+ import streamlit as st
+ import inflect
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ p = inflect.engine()
+
+ class FewDocumentsError(Exception):
+     def __init__(self, documents, size, msg):
+         self.documents = documents
+         self.size = size
+         self.msg = msg
+
+     def __str__(self):
+         return repr(self.msg)
+
+ def document_extraction(dataset, query, keywords, min_document_size, min_just_one_paragraph_size):
+     word_in_text = lambda word, text: any([p.compare(word, w) for w in text.split()])
+     lower_dataset = [document.lower() for document in dataset]
+     lower_query = query.lower()
+     lower_keywords = [keyword.lower() for keyword in keywords]
+
+     documents = {}
+
+     documents['QUERY'] = [
+         dataset[lower_dataset.index(document)] for document in lower_dataset
+         if (word_in_text(lower_query, document))
+         and (len(document.split()) > min_document_size)
+         and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
+     ]
+
+     documents['AND'] = [
+         dataset[lower_dataset.index(document)] for document in lower_dataset
+         if all(word_in_text(keyword, document) for keyword in lower_keywords)
+         and (len(document.split()) > min_document_size)
+         and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
+     ]
+
+     documents['OR'] = [
+         dataset[lower_dataset.index(document)] for document in lower_dataset
+         if any(word_in_text(keyword, document) for keyword in lower_keywords)
+         and (len(document.split()) > min_document_size)
+         and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
+     ]
+
+     empty = {
+         'QUERY': len(documents['QUERY']) == 0,
+         'AND': len(documents['AND']) == 0,
+         'OR': len(documents['OR']) == 0
+     }
+
+     sizes = {
+         'QUERY': len(documents['QUERY']),
+         'AND': len(documents['AND']),
+         'OR': len(documents['OR'])
+     }
+
+     if all(empty.values()):
+         # TODO: throw error
+         st.info(empty.values())
+         st.warning(f'No document found for the query "{query}", please try with another query')
+         st.stop()
+
+     if sizes['QUERY'] >= 10:
+         extracted_documents = documents['QUERY']
+     elif sizes['AND'] >= 10:
+         extracted_documents = documents['AND']
+     elif sizes['OR'] >= 10:
+         extracted_documents = documents['OR']
+     else:
+         number_of_documents = sizes['OR']
+         raise FewDocumentsError(documents['OR'], number_of_documents,
+             f'Only {number_of_documents} documents found for the query "{query}"\n\
+             Please select continue to proceed with {number_of_documents} documents or try again with another query'
+         )
+
+     return extracted_documents, empty, sizes
+
+ def paragraph_extraction(documents, min_paragraph_size):
+     paragraphs = [
+         documents[i].splitlines()[j] for i in range(len(documents)) for j in range(len(documents[i].splitlines()))
+         if (len(documents[i].splitlines()[j].split()) > min_paragraph_size)
+     ]
+
+     return paragraphs
+
+ def semantic_search(model, query, files, number_of_similar_files):
+     encoded_query = model.encode(query, device=device)
+     encoded_files = model.encode(files, device=device)
+
+     model_index = nmslib.init(method='hnsw', space='angulardist')
+     model_index.addDataPointBatch(encoded_files)
+     model_index.createIndex({'post': 2})
+
+     ids, distances = model_index.knnQuery(encoded_query, k=number_of_similar_files)
+
+     selected_files = [files[index] for index in ids]
+
+     distances = 180*distances/np.pi
+
+     return selected_files, distances
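A standalone sketch of semantic_search (not part of the commit), assuming sentence-transformers and nmslib are installed; the documents and query are illustrative:

    from sentence_transformers import SentenceTransformer
    from extractor._utils import semantic_search

    model = SentenceTransformer('msmarco-distilbert-base-v4')
    docs = [
        "The ocean absorbs a large share of emitted CO2.",
        "Coral reefs host much of the planet's marine biodiversity.",
        "Deep-sea mining raises environmental concerns.",
    ]
    # Returns the k nearest documents and their angular distances in degrees.
    selected, distances = semantic_search(model, "carbon uptake by the ocean", docs, number_of_similar_files=2)
    for doc, dist in zip(selected, distances):
        print(f"{dist:.1f} deg  {doc}")
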
extractor/extract.py ADDED
@@ -0,0 +1,70 @@
+ from sentence_transformers import SentenceTransformer
+ from ._utils import FewDocumentsError
+ from ._utils import document_extraction, paragraph_extraction, semantic_search
+ from corpora import gen_corpus
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ import string
+
+ from ._utils import device
+
+ def extract(query: str, n: int=3, extracted_documents: list=None) -> str:
+     """Extract n paragraphs from the corpus using the given query.
+
+     Parameters:
+     query (str): Sentence used to search the corpus for relevant documents
+     n (int): Number of paragraphs to return
+
+     Returns:
+     str: String containing the n most relevant paragraphs joined by line breaks
+     """
+     # Open corpus
+     corpus = gen_corpus(query)
+
+     # Setup query
+     stop_words = set(stopwords.words('english'))
+     query_tokens = word_tokenize(query.lower())
+     tokens_without_sw = [word for word in query_tokens if not word in stop_words]
+     keywords = [keyword for keyword in tokens_without_sw if keyword not in string.punctuation]
+
+     # Gross search
+     if not extracted_documents:
+         extracted_documents, documents_empty, documents_sizes = document_extraction(
+             dataset=corpus,
+             query=query,
+             keywords=keywords,
+             min_document_size=0,
+             min_just_one_paragraph_size=0
+         )
+
+     # First semantic search (over documents)
+     # Model for semantic searches
+     search_model = SentenceTransformer('msmarco-distilbert-base-v4', device=device)
+     selected_documents, documents_distances = semantic_search(
+         model=search_model,
+         query=query,
+         files=extracted_documents,
+         number_of_similar_files=10
+     )
+
+     # Second semantic search (over paragraphs)
+     paragraphs = paragraph_extraction(
+         documents=selected_documents,
+         min_paragraph_size=20,
+     )
+
+     # Model for the second semantic search
+     selected_paragraphs, paragraphs_distances = semantic_search(
+         model=search_model,
+         query=query,
+         files=paragraphs,
+         number_of_similar_files=10
+     )
+
+     from pprint import pprint
+     pprint(selected_paragraphs[:n])
+
+     text = '\n'.join(selected_paragraphs[:n])
+
+     return text
+
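A usage sketch for extract (not part of the commit), assuming the corpus sources above are available and the NLTK 'stopwords' and 'punkt' data have been downloaded; the query is illustrative:

    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')

    from extractor import extract

    # Returns the 3 paragraphs judged most relevant to the query, joined by newlines.
    text = extract("ocean acidification", n=3)
    print(text)
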
requirements.txt ADDED
@@ -0,0 +1,140 @@
+ altair==4.1.0
+ argh @ file:///home/conda/feedstock_root/build_artifacts/argh_1595627874344/work
+ argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1633990448879/work
+ astor @ file:///home/conda/feedstock_root/build_artifacts/astor_1593610464257/work
+ async-generator==1.10
+ attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1620387926260/work
+ backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
+ backports.functools-lru-cache @ file:///home/conda/feedstock_root/build_artifacts/backports.functools_lru_cache_1618230623929/work
+ base58 @ file:///home/conda/feedstock_root/build_artifacts/base58_1610222858736/work
+ beautifulsoup4 @ file:///home/conda/feedstock_root/build_artifacts/beautifulsoup4_1631087867185/work
+ bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1629908509068/work
+ blinker==1.4
+ blis @ file:///home/conda/feedstock_root/build_artifacts/cython-blis_1607338152218/work
+ boto3 @ file:///home/conda/feedstock_root/build_artifacts/boto3_1633765668486/work
+ botocore @ file:///home/conda/feedstock_root/build_artifacts/botocore_1633987764519/work
+ brotlipy==0.7.0
+ cachetools @ file:///home/conda/feedstock_root/build_artifacts/cachetools_1633010882559/work
+ catalogue @ file:///home/conda/feedstock_root/build_artifacts/catalogue_1632148432306/work
+ certifi==2021.10.8
+ cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1631636250774/work
+ charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1632423846401/work
+ click==7.1.2
+ colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1602866480661/work
+ cryptography @ file:///tmp/build/80754af9/cryptography_1633520369886/work
+ cupy @ file:///home/conda/feedstock_root/build_artifacts/cupy_1633141077366/work
+ cymem @ file:///home/conda/feedstock_root/build_artifacts/cymem_1625012218510/work
+ dataclasses @ file:///home/conda/feedstock_root/build_artifacts/dataclasses_1628958434797/work
+ debugpy @ file:///home/conda/feedstock_root/build_artifacts/debugpy_1627074853231/work
+ decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1631346842025/work
+ defusedxml @ file:///home/conda/feedstock_root/build_artifacts/defusedxml_1615232257335/work
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl
+ entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1605121927639/work/dist/entrypoints-0.3-py2.py3-none-any.whl
+ fastrlock==0.6
+ filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1633273934192/work
+ gitdb @ file:///tmp/build/80754af9/gitdb_1617117951232/work
+ GitPython @ file:///home/conda/feedstock_root/build_artifacts/gitpython_1632423794953/work
+ huggingface-hub @ file:///home/conda/feedstock_root/build_artifacts/huggingface_hub_1633615623676/work
+ idna @ file:///tmp/build/80754af9/idna_1622654382723/work
+ importlib-metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1630267473458/work
+ inflect @ file:///home/conda/feedstock_root/build_artifacts/inflect_1635835916074/work
+ ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1631291098355/work/dist/ipykernel-6.4.1-py3-none-any.whl
+ ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1632763773116/work
+ ipython-genutils==0.2.0
+ ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1631590360471/work
+ jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1610146787869/work
+ Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1633656206378/work
+ jmespath @ file:///home/conda/feedstock_root/build_artifacts/jmespath_1589369830981/work
+ joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1633637554808/work
+ jsonschema @ file:///home/conda/feedstock_root/build_artifacts/jsonschema_1633875207482/work
+ jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1633454794268/work
+ jupyter-core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1631852705892/work
+ jupyterlab-pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1601375948261/work
+ jupyterlab-widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1631590465624/work
+ MarkupSafe @ file:///home/conda/feedstock_root/build_artifacts/markupsafe_1621455668600/work
+ matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1631080358261/work
+ mistune @ file:///home/conda/feedstock_root/build_artifacts/mistune_1624941317779/work
+ mkl-fft==1.3.0
+ mkl-random==1.2.2
+ mkl-service==2.4.0
+ murmurhash @ file:///home/conda/feedstock_root/build_artifacts/murmurhash_1607334222000/work
+ nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1629120697898/work
+ nbconvert @ file:///home/conda/feedstock_root/build_artifacts/nbconvert_1632535927841/work
+ nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1617383142101/work
+ nest-asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1617163391303/work
+ nltk @ file:///home/conda/feedstock_root/build_artifacts/nltk_1633093058893/work
+ nmslib==2.0.16
+ notebook @ file:///home/conda/feedstock_root/build_artifacts/notebook_1631733685426/work
+ numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1629092040774/work
+ olefile @ file:///home/conda/feedstock_root/build_artifacts/olefile_1602866521163/work
+ packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1625323647219/work
+ pandas==1.2.5
+ pandocfilters @ file:///home/conda/feedstock_root/build_artifacts/pandocfilters_1631603243851/work
+ parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1617148930513/work
+ pathy @ file:///home/conda/feedstock_root/build_artifacts/pathy_1624897245984/work
+ pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1602535608087/work
+ pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
+ Pillow @ file:///tmp/build/80754af9/pillow_1625655408865/work
+ preshed @ file:///home/conda/feedstock_root/build_artifacts/preshed_1625048866397/work
+ prometheus-client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1622586138406/work
+ prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1629903925368/work
+ protobuf==3.17.2
+ ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
+ pyarrow==3.0.0
+ pybind11 @ file:///home/conda/feedstock_root/build_artifacts/pybind11-split_1633408728392/work
+ pybind11-global @ file:///home/conda/feedstock_root/build_artifacts/pybind11-split_1633408728392/work
+ pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1593275161868/work
+ pydantic @ file:///home/conda/feedstock_root/build_artifacts/pydantic_1620819959703/work
+ pydeck @ file:///home/conda/feedstock_root/build_artifacts/pydeck_1630093567483/work
+ Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1629119114968/work
+ pyOpenSSL @ file:///home/conda/feedstock_root/build_artifacts/pyopenssl_1633192417276/work
+ pyparsing==2.4.7
+ pyrsistent @ file:///tmp/build/80754af9/pyrsistent_1625052304482/work
+ PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1610291451001/work
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1626286286081/work
+ python-dotenv @ file:///home/conda/feedstock_root/build_artifacts/python-dotenv_1633777049699/work
+ pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1633452062248/work
+ PyYAML==5.4.1
+ pyzmq @ file:///home/conda/feedstock_root/build_artifacts/pyzmq_1631793304627/work
+ regex @ file:///home/conda/feedstock_root/build_artifacts/regex_1633750069182/work
+ requests @ file:///tmp/build/80754af9/requests_1629994808627/work
+ s3transfer @ file:///home/conda/feedstock_root/build_artifacts/s3transfer_1626384238958/work
+ sacremoses @ file:///home/conda/feedstock_root/build_artifacts/sacremoses_1588857588528/work
+ scikit-learn @ file:///tmp/build/80754af9/scikit-learn_1621370406642/work
+ scipy @ file:///tmp/build/80754af9/scipy_1630606796912/work
+ Send2Trash @ file:///home/conda/feedstock_root/build_artifacts/send2trash_1628511208346/work
+ sentence-transformers @ file:///home/conda/feedstock_root/build_artifacts/sentence-transformers_1626294390713/work
+ shellingham @ file:///home/conda/feedstock_root/build_artifacts/shellingham_1612179560728/work
+ six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
+ smart-open @ file:///home/conda/feedstock_root/build_artifacts/smart_open_1630238320325/work
+ smmap @ file:///home/conda/feedstock_root/build_artifacts/smmap_1612632339838/work
+ soupsieve @ file:///tmp/build/80754af9/soupsieve_1616183228191/work
+ spacy @ file:///home/conda/feedstock_root/build_artifacts/spacy_1632244522110/work
+ spacy-legacy @ file:///home/conda/feedstock_root/build_artifacts/spacy-legacy_1625687473390/work
+ srsly @ file:///home/conda/feedstock_root/build_artifacts/srsly_1618231647618/work
+ streamlit @ file:///home/conda/feedstock_root/build_artifacts/streamlit_1633453177811/work
+ terminado @ file:///home/conda/feedstock_root/build_artifacts/terminado_1631128166466/work
+ testpath @ file:///home/conda/feedstock_root/build_artifacts/testpath_1621261527237/work
+ thinc @ file:///home/conda/feedstock_root/build_artifacts/thinc_1632600737641/work
+ threadpoolctl @ file:///home/conda/feedstock_root/build_artifacts/threadpoolctl_1633102299089/work
+ tokenizers @ file:///home/conda/feedstock_root/build_artifacts/tokenizers_1632285667965/work
+ toml @ file:///home/conda/feedstock_root/build_artifacts/toml_1604308577558/work
+ toolz @ file:///home/conda/feedstock_root/build_artifacts/toolz_1600973991856/work
+ torch==1.9.1
+ torchaudio==0.9.0a0+a85b239
+ torchvision==0.10.1
+ tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1610094708661/work
+ tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1632160078689/work
+ traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1630423529112/work
+ transformers @ file:///home/conda/feedstock_root/build_artifacts/transformers_1633051764196/work
+ typer @ file:///home/conda/feedstock_root/build_artifacts/typer_1630326630489/work
+ typing-extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1632313171031/work
+ tzlocal @ file:///home/conda/feedstock_root/build_artifacts/tzlocal_1629721600364/work
+ urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1632350318291/work
+ validators @ file:///home/conda/feedstock_root/build_artifacts/validators_1608296160673/work
+ wasabi @ file:///home/conda/feedstock_root/build_artifacts/wasabi_1612156086016/work
+ watchdog @ file:///home/conda/feedstock_root/build_artifacts/watchdog_1633154568778/work
+ wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1600965781394/work
+ webencodings==0.5.1
+ widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1605475529901/work
+ zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1633302054558/work
summarizer/__init__.py ADDED
@@ -0,0 +1 @@
+ from .summarize import summarize
summarizer/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (167 Bytes).

summarizer/__pycache__/_utils.cpython-39.pyc ADDED
Binary file (115 Bytes).

summarizer/__pycache__/summarize.cpython-39.pyc ADDED
Binary file (1.04 kB).
 
summarizer/_utils.py ADDED
File without changes
summarizer/summarize.py ADDED
@@ -0,0 +1,33 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ def summarize(text: str) -> str:
+     """
+     Generate a summary based on the given text
+     """
+
+     # Model for abstraction
+     model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
+     tokenizer = AutoTokenizer.from_pretrained('t5-base')
+
+     input_tokens = tokenizer.encode(
+         f'summarize: {text}',
+         return_tensors='pt',
+         max_length=tokenizer.model_max_length,
+         truncation=True
+     )
+
+     summary_ids = model.generate(
+         input_tokens,
+         min_length=80,
+         max_length=150,
+         length_penalty=15,
+         num_beams=2
+     )
+
+     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+     summary = '. '.join([phrase.capitalize() for phrase in summary.split('. ')])
+     if not summary[-1] == '.':
+         summary = summary + '.'
+
+     return summary
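A usage sketch for summarize (not part of the commit); the first call downloads the t5-base checkpoint from the Hugging Face Hub, and the input below is only a placeholder for the paragraphs returned by extractor.extract:

    from summarizer import summarize

    text = (
        "Ocean acidification is the ongoing decrease in the pH of the oceans, "
        "caused by the uptake of carbon dioxide from the atmosphere. ..."
    )
    print(summarize(text))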