import json

import spacy
import streamlit as st
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen


def hide_footer():
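    # Inject CSS to hide Streamlit's default footer.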
    hide_st_style = """
            <style>
            footer {visibility: hidden;}
            </style>
            """
    st.markdown(hide_st_style, unsafe_allow_html=True)

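# Hugging Face model/tokenizer loaders. Each is cached with st.cache_resource
# so the artifact is loaded once per process and reused across Streamlit reruns.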
@st.cache_resource
def get_seq2seq_model(model_id):
    return AutoModelForSeq2SeqLM.from_pretrained(model_id)

@st.cache_resource
def get_causal_model(model_id):
    return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

@st.cache_resource
def get_auto_model(model_id):
    return AutoModel.from_pretrained(model_id)

@st.cache_resource
def get_tokenizer(model_id):
    return AutoTokenizer.from_pretrained(model_id)

@st.cache_data
def get_celeb_data(fpath):
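    # Load the local JSON file once and cache the parsed result.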
    with open(fpath, encoding='UTF-8') as json_file:
        return json.load(json_file)

def get_article(url):
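    # Spoof a browser User-Agent; many sites reject urllib's default agent.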
    req = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        html = urlopen(req).read()
        soup = BeautifulSoup(html, features="html.parser")

        # Remove script and style elements so only visible text remains.
        for script in soup(["script", "style"]):
            script.extract()

        lines = []

        # Collect the text of each article paragraph; the class name
        # 'topic-paragraph' is specific to the target site's markup.
        for para in soup.find_all("p", class_='topic-paragraph'):
            lines.append(para.get_text().strip())

        # Split each paragraph on double spaces, drop empty fragments,
        # and join everything into one whitespace-normalized string.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        return text

    except (HTTPError, URLError):
        st.markdown("Could not fetch the article. Please check the URL and your internet connection.")
        return ""

@st.cache_resource
def get_spacy_model(model_id):
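    # Load and cache the spaCy pipeline (model_id is a spaCy model name,
    # e.g. 'en_core_web_sm').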
    return spacy.load(model_id)

def preprocess_text(name, text: str, model_id):
    # Split the text into sentences with spaCy. `name` is currently unused.
    spacy_model = get_spacy_model(model_id)
    texts = [sent.text.strip() for sent in spacy_model(text).sents]
    return spacy_model, texts
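
# Minimal usage sketch (hypothetical values: the model id, URL, and spaCy
# pipeline name below are illustrative, not fixed by this module):
#
#     tokenizer = get_tokenizer("google/flan-t5-small")
#     model = get_seq2seq_model("google/flan-t5-small")
#     article = get_article("https://example.com/some-article")
#     nlp, sentences = preprocess_text("Some Name", article, "en_core_web_sm")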