File size: 4,274 Bytes
55bdad4
86bb186
480b6a8
55bdad4
 
03251df
 
 
2d7ed25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50da02d
 
 
55bdad4
 
 
1ce306e
ee38f19
55bdad4
 
03251df
 
 
96db83a
a8e4a85
55bdad4
 
 
 
 
 
8b71a2e
55bdad4
 
 
ee38f19
55bdad4
 
 
 
 
a8e4a85
55bdad4
a8e4a85
55bdad4
2d7ed25
25dae2e
5a40ca9
f058f94
25dae2e
55bdad4
284129c
d543a3c
 
2d7ed25
 
 
55bdad4
2d7ed25
55bdad4
 
2d7ed25
 
 
55bdad4
 
2d7ed25
55bdad4
 
a96b11a
55bdad4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import streamlit as st
from datasets import load_dataset
from transformers import pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast, AutoModelForSequenceClassification, AutoTokenizer, TFAutoModelForSequenceClassification

# Selectable model options. The first two entries are special-cased by name
# below: the finetuned HUPD model and the pipeline's built-in default.
MODEL_OPTS = [
    'finetuned',
    'default',
    'bertweet-base-sentiment-analysis',
    'twitter-roberta-base',
    'distilRoberta-financial-sentiment',
]
FINETUNED_OPT, DEFAULT_OPT = MODEL_OPTS[:2]

# Helper function
def map_decision_to_string(example):
    """Return a one-key dict holding the translated 'decision' of a record.

    The value of ``example['decision']`` is looked up in the module-level
    ``decision_to_str`` table and returned under the key ``'decision'``.

    NOTE(review): ``decision_to_str`` is not defined in the visible part of
    this file -- confirm it is provided before this helper is called.
    """
    raw_decision = example['decision']
    translated = decision_to_str[raw_decision]
    return {'decision': translated}

def load_abstracts():
    """Load the HUPD 'sample' configuration and return its training abstracts.

    The training split is restricted to filings dated 2016-01-01 through
    2016-01-31; the validation split is requested for 2016-01-01 only and is
    not used here.

    Returns:
        The 'abstract' column of the 'train' split.
    """
    # Keyword arguments passed straight through to load_dataset.
    hupd_config = {
        'name': 'sample',
        'data_files': "https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        'icpr_label': None,
        'train_filing_start_date': '2016-01-01',
        'train_filing_end_date': '2016-01-31',
        'val_filing_start_date': '2016-01-01',
        'val_filing_end_date': '2016-01-01',
    }
    splits = load_dataset('HUPD/hupd', **hupd_config)
    # Only the abstracts are needed; the rest of the dataset is released when
    # this function returns.
    return splits['train']['abstract']

# returns loaded model and tokenizer, if any
def load_model(opt):
    """Load the model and tokenizer that belong to a model option.

    Args:
        opt: One of the option names offered to the user. The spelling
            'twitter-roberta-base-sentiment' is accepted as an alias of
            'twitter-roberta-base'.

    Returns:
        A ``(model, tokenizer)`` tuple. Both entries are ``None`` for the
        default option and for an unrecognised option, in which case the
        caller is expected to fall back to the pipeline's own default model.
    """
    # Options whose model and tokenizer both come from the same Hub
    # checkpoint and load through the Auto* classes.
    auto_checkpoints = {
        'bertweet-base-sentiment-analysis': "finiteautomata/bertweet-base-sentiment-analysis",
        'twitter-roberta-base': "cardiffnlp/twitter-roberta-base-sentiment",
        'twitter-roberta-base-sentiment': "cardiffnlp/twitter-roberta-base-sentiment",
        'distilRoberta-financial-sentiment': "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
    }

    if opt == FINETUNED_OPT:
        # The finetuned weights are paired with the base DistilBERT tokenizer.
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained('saccharinedreams/finetuned-distilbert-base-uncased-for-hupd')
        return model, tokenizer

    if opt == DEFAULT_OPT:
        # Nothing to load: the pipeline picks its own default model.
        return None, None

    checkpoint = auto_checkpoints.get(opt)
    if checkpoint is None:
        # Unknown option: report it but keep the app running on the default.
        print("Incorrect model selection. Try again!")
        return None, None

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return model, tokenizer

def sentiment_analysis(model, tokenizer):
    """Build a 'text-classification' pipeline.

    The supplied model and tokenizer are used only when both are truthy;
    otherwise the pipeline is created with its built-in default model.
    """
    if not (model and tokenizer):
        return pipeline('text-classification')
    return pipeline('text-classification', model=model, tokenizer=tokenizer)

# Page header: title plus links to the hosted app and the model repository.
st.title('Finetuned Sentiment Analysis for US Patents')
st.markdown('Link to the app - [sentiment-analysis-app](https://huggingface.co/spaces/saccharinedreams/sentiment-analysis-app)')
# The closing parenthesis is required for Markdown to render this as a link.
st.markdown('Link to the model - [model repo](https://huggingface.co/saccharinedreams/finetuned-distilbert-base-uncased-for-hupd)')
st.markdown('This model was finetuned on the Harvard USPTO Patent Dataset and uses Distilbert-Base-Uncased.')

abstracts = load_abstracts()
if len(abstracts) == 0:
    # Without any abstract there is nothing to select or classify.
    st.error('No abstracts could be loaded from the HUPD dataset.')
    st.stop()

# Let the user pick an abstract; the first one is preselected.
dropdown_abstracts = st.selectbox('Select one of the following abstracts from the HUPD dataset:', abstracts, index=0)
model, tokenizer = load_model(FINETUNED_OPT)

# Take in user input
#user_text = st.text_input('Input text to perform sentiment analysis on here.', 'I love AI!')

# The user can interact with a dropdown menu to choose a sentiment analysis model.
#dropdown_value = st.selectbox('Select one of the following sentiment analysis models', MODEL_OPTS, index=MODEL_OPTS.index(DEFAULT_OPT))
#model, tokenizer = load_model(dropdown_value)


# Classify the selected abstract
result = sentiment_analysis(model, tokenizer)(dropdown_abstracts)

# Display the classification result (first entry of the pipeline output)
st.markdown('Labels 0, 1: Not accepted, Accepted')
st.write('Sentiment:', result[0]['label'], '; Score:', result[0]['score'])