import streamlit as st
import pandas as pd
import tensorflow as tf

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)

st.title("CS634 - milestone2 - Tedi Pano")

@st.cache_resource
def load_data():
    # Load the 'sample' configuration of the Harvard USPTO Patent Dataset (HUPD),
    # split into train/validation windows by filing date.
    dataset_dict = load_dataset('HUPD/hupd',
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather", 
        icpr_label=None,
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
    )

    st.write('Loading is done!')
    return dataset_dict

@st.cache_resource
def training_computation(_dataset_dict):
    # The leading underscore tells Streamlit's cache not to hash the dataset argument.
    df = pd.DataFrame(_dataset_dict['train'])
    vf = pd.DataFrame(_dataset_dict['validation'])

    # Keep only decided applications and map the decision to a binary label
    # (1 = ACCEPTED, 0 = REJECTED). .copy() avoids SettingWithCopyWarning.
    accepted_rejected = ['ACCEPTED', 'REJECTED']
    df = df[df['decision'].isin(accepted_rejected)].copy()
    df['patentability_score'] = df['decision'].map({'ACCEPTED': 1, 'REJECTED': 0})
    vf = vf[vf['decision'].isin(accepted_rejected)].copy()
    vf['patentability_score'] = vf['decision'].map({'ACCEPTED': 1, 'REJECTED': 0})
    
    st.write("Processed the data")
    
    
    from sklearn.model_selection import train_test_split
    dftrain, dftest = train_test_split(df, test_size = 0.90, random_state = 0)
    
    from transformers import DistilBertTokenizerFast
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    
    X_dtrain = dftrain['abstract'].tolist()
    y_dtrain = dftrain['patentability_score'].tolist()
    
    X_vtrain = vf['abstract'].tolist()
    y_vtrain = vf['patentability_score'].tolist()
    
    X_dtest = dftest['abstract'].tolist()
    y_dtest = dftest['patentability_score'].tolist()
    
    train_encodings = tokenizer(X_dtrain, truncation=True, padding=True)
    val_encodings = tokenizer(X_vtrain, truncation=True, padding=True)
    test_encodings = tokenizer(X_dtest, truncation=True, padding=True)
    
    st.write("tokenizing completed!")
    
    # Wrap the encodings and labels as tf.data.Dataset objects for TFTrainer.
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        y_dtrain
    ))
    
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        y_vtrain
    ))
    
    # The test split is held out here; see the evaluation sketch before the return.
    test_dataset = tf.data.Dataset.from_tensor_slices((
        dict(test_encodings),
        y_dtest
    ))

    st.write("TensorFlow datasets built!")
    
    # Note: TFTrainer/TFTrainingArguments are deprecated in recent transformers
    # releases; this assumes a 4.x release with TensorFlow support that still ships them.

    training_args = TFTrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        eval_steps=500,
        weight_decay=0.01
    )
    
    
    # Create the model inside the strategy's scope (required by TFTrainingArguments).
    with training_args.strategy.scope():
        model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    
    trainer = TFTrainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,         
        eval_dataset=val_dataset             
    )

    trainer.train()

    st.write("training completed")
    return trainer
    

dataset_dict = load_data()
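
# Optional addition (not in the original milestone): show the split sizes in the UI.
st.write(f"Train: {len(dataset_dict['train'])} applications; "
         f"validation: {len(dataset_dict['validation'])} applications")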
trainer = training_computation(dataset_dict)


patents = pd.DataFrame(dataset_dict['train'])
patent_selection = st.selectbox("Select Patent", patents['patent_number'])

# Show the abstract and claims of the selected application as plain text.
patent = patents.loc[patents['patent_number'] == patent_selection]
st.write(patent['abstract'].iloc[0])
st.write(patent['claims'].iloc[0])
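
# --- Hedged sketch (not in the original milestone code): score the selected
# patent's abstract with the fine-tuned model. The tokenizer is re-created here
# because training_computation() does not return it; `trainer.model` exposes the
# underlying TFDistilBertForSequenceClassification.
inference_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
inputs = inference_tokenizer(patent['abstract'].iloc[0], truncation=True, return_tensors='tf')
logits = trainer.model(inputs).logits
# Softmax over the two classes; index 1 is the ACCEPTED label defined above.
score = tf.nn.softmax(logits, axis=-1).numpy()[0][1]
st.write(f"Predicted patentability score: {score:.3f}")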