File size: 8,659 Bytes
c5b0957 d828c62 c5b0957 db5eefc ef00cba d828c62 ef00cba d828c62 c5b0957 d828c62 9604970 d828c62 4cf43fd d828c62 aabe8a0 d828c62 aabe8a0 d828c62 aabe8a0 d828c62 ef00cba a2bb83f ef00cba d828c62 ef00cba d828c62 ef00cba d828c62 ef00cba d828c62 ef00cba d828c62 ef00cba d828c62 ef00cba d828c62 9604970 d828c62 9604970 26d624c 9604970 aabe8a0 9604970 e517ab1 9604970 e517ab1 9604970 be67260 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 |
# streamlit_app.py manages the whole TopicDig process
from typing import List, Set
from collections import namedtuple
import random
import requests
import json
from codetiming import Timer
import streamlit as st
from digestor import Digestor
from source import Source
from scrape_sources import NPRLite, CNNText, stub
def initialize(limit, rando, use_cache=True):
    """Scrape every source, run NER on each headline, and build topic clusters.

    Args:
        limit: Total number of articles to request across all sources. It is
            split evenly between them with integer division. When ``None``,
            each source's ``retrieve_cluster_data()`` is called with no
            argument.
        rando: Not used by this function; kept so existing callers still work.
        use_cache: Forwarded to ``perform_ner`` as its ``cache`` flag.

    Returns:
        A ``(article_dict, clusters)`` tuple. ``article_dict`` maps each
        stub's ``hed`` to the stub itself. ``clusters`` maps each entity
        string to the list of stubs tagged with that entity.

    Side effects:
        Stores one status string per source (keyed by ``source_name``) and a
        ``'num_clusters'`` status string in ``st.session_state``.
    """
    clusters: dict = {}

    # Container for the source classes.
    # FOR NOW the sources are listed explicitly here; the final version
    # should read them from configuration instead.
    sources: List[Source] = [
        NPRLite(
            'npr',
            'https://text.npr.org/1001',
            'sshleifer/distilbart-cnn-12-6',
            'dbmdz/bert-large-cased-finetuned-conll03-english'
        ),
        CNNText(
            'cnn',
            'https://lite.cnn.com',
            'sshleifer/distilbart-cnn-12-6',
            'dbmdz/bert-large-cased-finetuned-conll03-english'
        ),
    ]

    # Flat list of article stubs from every source. Each source returns
    # stubs whose entity lists are still empty.
    cluster_data = []
    for data_source in sources:
        if limit is not None:
            c_data, _ = data_source.retrieve_cluster_data(limit // len(sources))
        else:
            c_data, _ = data_source.retrieve_cluster_data()
        # Extend rather than collect-and-index, so the result does not depend
        # on there being exactly two sources.
        cluster_data.extend(c_data)
        st.session_state[data_source.source_name] = f"Number of clusters from source: {data_source.source_name}\n\t{len(c_data)}"
    print("Finished...moving on to clustering...")

    # NER: fill in each stub's entity list from its headline, then file the
    # stub under every entity it was tagged with.
    for tup in cluster_data:
        perform_ner(tup, cache=use_cache)
        generate_clusters(clusters, tup)
    st.session_state['num_clusters'] = f"""Total number of clusters: {len(clusters)}"""

    # Track every stub by headline. The loop variable is deliberately not
    # called `stub`, to avoid shadowing the name imported from scrape_sources.
    article_dict = {article.hed: article for article in cluster_data}
    return article_dict, clusters
# Am I going to use this for those two lines?
def perform_ner(tup:namedtuple('article',['link','hed','entities', 'source']), cache=True):
    """Run NER on one article stub's headline and record the entities on it.

    Args:
        tup: Article stub. Its ``hed`` is sent to the NER endpoint and its
            ``entities`` list is extended in place with the results.
        cache: Sent to the inference API as the ``use_cache`` option.

    Returns:
        None. The stub's ``entities`` list is mutated.
    """
    payload = {
        "inputs": tup.hed,
        # `use_cache` is an Inference API *option*. It used to be sent under
        # a misspelled "paramters" key, which is not a field the API defines.
        "options": {
            "use_cache": cache,
        },
    }
    # Time only the query and the post-processing of its response.
    with Timer(name="ner_query_time", logger=None):
        result = ner_results(ner_query(payload))
    tup.entities.extend(result)
@st.cache()
def ner_query(payload):
    """POST ``payload`` as JSON to ``NER_API_URL`` and return the parsed reply.

    Args:
        payload: JSON-serialisable request body.

    Returns:
        The response body decoded as UTF-8 and parsed with ``json.loads``.
    """
    print("making a query....")
    reply = requests.request(
        "POST",
        NER_API_URL,
        headers=headers,
        data=json.dumps(payload),
    )
    body_text = reply.content.decode("utf-8")
    return json.loads(body_text)
def generate_clusters(
    the_dict: dict,
    tup : namedtuple('article_stub',[ 'link','hed','entities', 'source'])
    ) -> dict:
    """File an article stub under every entity it is tagged with.

    Args:
        the_dict: Cluster mapping of entity -> list of stubs. Mutated in place.
        tup: Article stub whose ``entities`` are used as the cluster keys.

    Returns:
        ``the_dict`` itself, so the declared ``-> dict`` return type holds.
        Callers that ignore the return value are unaffected.
    """
    for entity in tup.entities:
        # Create the cluster on first sight of the entity, then add the stub.
        the_dict.setdefault(entity, []).append(tup)
    return the_dict
def ner_results(ner_object, groups=True, NER_THRESHOLD=0.5) -> List[str]:
    """Sort NER hits by type and return the unique entity names.

    Args:
        ner_object: Iterable of dicts, each with a ``'word'``, a ``'score'``
            and a label stored under ``'entity_group'`` (grouped results) or
            ``'entity'`` (ungrouped results).
        groups: ``True`` when the results were aggregated, in which case
            labels look like ``'PER'``. ``False`` for ungrouped results,
            where labels look like ``'I-PER'``.
        NER_THRESHOLD: A hit is kept only if its score is strictly greater
            than this value.

    Returns:
        Entity names longer than two characters: people first, then places,
        organisations and miscellaneous. Names are de-duplicated within each
        type and kept in first-seen order, so the result is deterministic.
        Hits whose word contains ``'#'`` are dropped, and hits with an
        unrecognised label are skipped rather than raising ``KeyError``.
    """
    # 'label_key' and 'prefix' handle the difference between the dictionary
    # keys and labels used for grouped vs. ungrouped aggregation.
    label_key = 'entity_group' if groups else 'entity'
    prefix = '' if groups else 'I-'

    # One bucket per entity type, in output order. Dicts are used as
    # insertion-ordered sets so duplicates collapse without losing order.
    buckets = {prefix + kind: {} for kind in ('PER', 'LOC', 'ORG', 'MISC')}

    for hit in ner_object:
        word = hit['word']
        if '#' in word or not hit['score'] > NER_THRESHOLD:
            continue
        bucket = buckets.get(hit[label_key])
        if bucket is None:
            # Label outside the four handled types: ignore it.
            continue
        bucket[word] = None

    # Flatten all buckets, dropping names of one or two characters.
    return [word for bucket in buckets.values() for word in bucket if len(word) > 2]
# These could be passed through the command line
# or read from a config file.
# One of these is needed here for NER and one in Digestor for summarization.
NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"
headers = {"Authorization": f"""Bearer {st.secrets['ato']}"""}

LIMIT = 20 # Controls time and number of clusters.
USE_CACHE = True

if not USE_CACHE:
    print("NOT USING CACHE--ARE YOU GATHERING DATA?")
if LIMIT is not None:
    print(f"LIMIT: {LIMIT}")

# digest store
digests = dict() # key is cluster, value is digestor object
out_dicts = []

# Retrieve cluster data, create the dict that tracks each article stub,
# and create topic clusters by performing NER.
print("Initializing....")
# `use_cache` must be passed by keyword: the second positional parameter of
# initialize() is `rando`, which initialize() does not use, so passing
# USE_CACHE positionally never reached the cache flag.
article_dict, clusters = initialize(LIMIT, False, use_cache=USE_CACHE)

# We now have clusters and cluster data.
# Display them and collect the user's input through Streamlit widgets.

# Button to refresh topics.
if st.button("Refresh topics!"):
    article_dict, clusters = initialize(LIMIT, False, use_cache=USE_CACHE)

# List to accept user choices.
selections = []
# 'None' is a placeholder entry meaning "no topic chosen in this menu".
choices = list(clusters.keys())
choices.insert(0,'None')

# Status strings written to session state by initialize().
st.write(st.session_state['cnn'])
st.write(st.session_state['npr'])
st.write(st.session_state['num_clusters'])

# Form used to take 3 menu inputs
with st.form(key='columns_in_form'):
    cols = st.columns(3)
    for i, col in enumerate(cols):
        selections.append(col.selectbox('Make a Selection', choices, key=i))
    submitted = st.form_submit_button('Submit')

if submitted:
    selections = [i for i in selections if i is not None]
    with st.spinner(text="Digesting...please wait, this will take a few moments...Maybe check some messages or start reading the latest papers on summarization with transformers...."):
        chosen = []
        for i in selections: # i is a cluster key, or the 'None' placeholder
            if i != 'None':
                for j in clusters[i]:
                    if j not in chosen:
                        chosen.append(j) # j is a stub.

        # Digestor uses 'chosen' to create the digest.
        # 'user_choices' is passed for reference.
        digestor = Digestor(timer=Timer(), cache = USE_CACHE, stubs=chosen, user_choices=list(selections))
        # Digesting is a separate call, so it isn't automatic upon digestor creation.
        # Easily turn caching off for testing.
        digestor.digest() # creates summaries and stores them associated with the digest

    # Get displayable digest and digest data
    digestor.build_digest()

    if not digestor.text:
        st.write("You didn't select a topic!")
    else:
        st.write("Your digest is ready:\n")
        st.write(digestor.text)

# Bare expression on purpose: it relies on Streamlit's "magic" rendering of
# top-level expressions to show the session state for debugging.
"st.session_state object:", st.session_state