import streamlit as st
import pandas as pd
from transformers import pipeline
from pprint import pprint
from datasets import load_dataset
from torch.utils.data import DataLoader

st.title("CS634 - milestone2 - Tedi Pano")

# Decisions kept for training, mapped to the binary label the classifier learns.
_DECISION_LABELS = {'ACCEPTED': 1, 'REJECTED': 0}


@st.cache_resource
def load_data():
    """Load the HUPD 'sample' configuration and return the dataset dict.

    The train split covers filings from 2016-01-01 to 2016-01-21 and the
    validation split covers 2016-01-22 to 2016-01-31. The result is cached
    with ``st.cache_resource`` so the load is not repeated on every rerun
    of the script.
    """
    dataset_dict = load_dataset(
        'HUPD/hupd',
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
    )
    st.write('Loading is done!')
    return dataset_dict


def _label_decisions(split):
    """Return *split* as a DataFrame restricted to ACCEPTED/REJECTED rows.

    A ``patentability_score`` column is added (1 for ACCEPTED, 0 for
    REJECTED).
    """
    frame = pd.DataFrame(split)
    # .copy() makes the filtered frame independent of the original, so the
    # column assignment below does not raise SettingWithCopyWarning.
    frame = frame[frame['decision'].isin(list(_DECISION_LABELS))].copy()
    frame['patentability_score'] = frame['decision'].map(_DECISION_LABELS)
    return frame


@st.cache_resource
def training_computation(_dataset_dict):
    """Fine-tune DistilBERT on patent abstracts and return the trainer.

    Args:
        _dataset_dict: Mapping with 'train' and 'validation' splits, each
            providing 'decision' and 'abstract' fields. The leading
            underscore tells Streamlit not to hash this argument when
            computing the cache key.

    Returns:
        The ``TFTrainer`` instance after ``train()`` has completed.
    """
    df = _label_decisions(_dataset_dict['train'])
    vf = _label_decisions(_dataset_dict['validation'])
    st.write("Processed the data")

    # Heavy dependencies are imported lazily, only when training really runs.
    from sklearn.model_selection import train_test_split

    # Only 10% of the labelled train split is used for fine-tuning. The
    # remaining 90% is not tokenized, because nothing consumes it.
    dftrain, _ = train_test_split(df, test_size=0.90, random_state=0)

    from transformers import DistilBertTokenizerFast

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    X_dtrain = dftrain['abstract'].tolist()
    y_dtrain = dftrain['patentability_score'].tolist()
    X_vtrain = vf['abstract'].tolist()
    y_vtrain = vf['patentability_score'].tolist()

    train_encodings = tokenizer(X_dtrain, truncation=True, padding=True)
    val_encodings = tokenizer(X_vtrain, truncation=True, padding=True)
    st.write("tokenizing completed!")

    import tensorflow as tf

    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        y_dtrain,
    ))
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        y_vtrain,
    ))
    st.write("back to dataset!")

    # NOTE(review): TFTrainer/TFTrainingArguments appear to be deprecated in
    # newer transformers releases -- confirm the pinned version provides them.
    from transformers import (
        TFDistilBertForSequenceClassification,
        TFTrainer,
        TFTrainingArguments,
    )

    training_args = TFTrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        eval_steps=500,
        weight_decay=0.01,
    )

    # The model is created inside the distribution strategy scope provided by
    # the training arguments.
    with training_args.strategy.scope():
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased"
        )

    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()
    st.write("training completed")
    return trainer


dataset_dict = load_data()
trainer = training_computation(dataset_dict)

# Patent browser: pick a patent number from the train split and show its
# abstract and claims.
patents = pd.DataFrame(dataset_dict['train'])
patent_selection = st.selectbox("Select Patent", patents['patent_number'])
patent = patents.loc[patents['patent_number'] == patent_selection]
st.write(patent['abstract'])
st.write(patent['claims'])