"""Streamlit demo: fine-tune DistilBERT on HUPD abstracts and score a patent.

The app loads the HUPD ``sample`` configuration, fine-tunes a DistilBERT
sequence classifier on a small random slice of the training abstracts
(ACCEPTED vs. REJECTED), and lets the user pick an application and ask the
model for its verdict.
"""

from pprint import pprint  # noqa: F401  (kept: unused here, may be used elsewhere)

import streamlit as st
import pandas as pd
from transformers import pipeline  # noqa: F401  (kept: unused here)
from transformers import (
    TFDistilBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from datasets import load_dataset
import tensorflow as tf

MODEL_NAME = "distilbert-base-uncased"

# Decisions the classifier is trained on, mapped to their integer label.
DECISION_TO_SCORE = {"ACCEPTED": 1, "REJECTED": 0}

# Fraction of each split that is discarded so the demo trains quickly.
HELD_OUT_FRACTION = 0.99

st.title("CS634 - milestone3/4 - Tedi Pano")


def _label_decisions(frame):
    """Keep ACCEPTED/REJECTED rows and add a binary ``patentability_score``.

    Returns a copy, so adding the column never writes into a view of *frame*.
    """
    keep = frame["decision"].isin(list(DECISION_TO_SCORE))
    labelled = frame[keep].copy()
    labelled["patentability_score"] = labelled["decision"].map(DECISION_TO_SCORE)
    return labelled


def _subsample(frame):
    """Return the small random slice of *frame* that is actually used.

    Falls back to the whole frame when it is too small for the split to
    leave at least one row (``train_test_split`` raises ``ValueError`` then).
    """
    try:
        kept, _ = train_test_split(
            frame, test_size=HELD_OUT_FRACTION, random_state=None
        )
    except ValueError:
        return frame
    return kept


def _to_tf_dataset(tokenizer, texts, labels):
    """Tokenize *texts* and pair the encodings with *labels* as a tf Dataset."""
    encodings = tokenizer(texts, truncation=True, padding=True)
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


@st.cache_resource
def load_data():
    """Load the HUPD ``sample`` dataset (January 2016 filings) once per process.

    Returns:
        The object returned by ``load_dataset``; the rest of the script reads
        its ``'train'`` and ``'validation'`` entries.
    """
    dataset_dict = load_dataset(
        'HUPD/hupd',
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
    )
    st.write('Loading is done!')
    return dataset_dict


@st.cache_resource
def training_computation(_dataset_dict):
    """Fine-tune DistilBERT on a slice of the HUPD abstracts and return the trainer.

    Args:
        _dataset_dict: Mapping with ``'train'`` and ``'validation'`` splits.
            The leading underscore tells Streamlit not to hash the argument
            when computing the cache key.

    Returns:
        The ``TFTrainer`` after ``train()`` has completed.
    """
    train_frame = _label_decisions(pd.DataFrame(_dataset_dict['train']))
    val_frame = _label_decisions(pd.DataFrame(_dataset_dict['validation']))
    st.write("Processed the data")

    train_subset = _subsample(train_frame)
    # The validation subset must come from the validation frame; the original
    # code split the training frame a second time here.
    val_subset = _subsample(val_frame)

    tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
    train_dataset = _to_tf_dataset(
        tokenizer,
        train_subset['abstract'].tolist(),
        train_subset['patentability_score'].tolist(),
    )
    val_dataset = _to_tf_dataset(
        tokenizer,
        val_subset['abstract'].tolist(),
        val_subset['patentability_score'].tolist(),
    )
    st.write("tokenizing completed!")

    training_args = TFTrainingArguments(
        output_dir='./results',
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=5,
        eval_steps=5,
    )

    # The model has to be created inside the distribution strategy scope.
    with training_args.strategy.scope():
        model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    st.write("training in progress.....")
    trainer.train()
    st.write("training completed")
    return trainer


dataset_dict = load_data()
trainer = training_computation(dataset_dict)

patents = _label_decisions(pd.DataFrame(dataset_dict['train']))

patent_selection = st.selectbox("Select Patent", patents['patent_number'])
patent = patents.loc[patents['patent_number'] == patent_selection]
st.write(patent['abstract'])
st.write(patent['claims'])

with st.form("my_form"):
    submitted = st.form_submit_button("Submit")

    # Run inference only on submit; Streamlit re-executes the whole script on
    # every interaction, so doing it unconditionally wastes a forward pass.
    if submitted:
        tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
        # Labels are passed only because the trainer expects (features, label)
        # pairs; they are not used to decide the verdict below.
        test_dataset = _to_tf_dataset(
            tokenizer,
            patent['abstract'].tolist(),
            patent['patentability_score'].tolist(),
        )
        predictions = trainer.predict(test_dataset)

        # ``predictions.predictions`` holds the logits; ``label_ids`` is the
        # ground truth that was fed in, so it must not be shown as the verdict.
        logits = predictions.predictions[0]
        probabilities = tf.nn.softmax(logits).numpy()
        predicted_class = int(probabilities.argmax())

        verdict = "ACCEPTED" if predicted_class == 1 else "REJECTED"
        st.write(f"Patent is {verdict}")
        st.write("with a certainty of " + str(probabilities[predicted_class]))