# Import statements import streamlit as st from transformers import pipeline from transformers import AutoTokenizer, AutoModelForSequenceClassification import numpy as np from datasets import load_dataset # Torch and torch dataloader import torch from torch.utils.data import DataLoader st.title('Patentability Decision App') # Input all validation patent files dataset_dict = load_dataset('HUPD/hupd', name='sample', data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather", icpr_label=None, train_filing_start_date='2016-01-01', train_filing_end_date='2016-01-21', val_filing_start_date='2016-01-22', val_filing_end_date='2016-01-31', ) # Remove all untrained decisions # Label-to-index mapping for the decision status field decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5} # Helper function def map_decision_to_string(example): return {'decision': decision_to_str[example['decision']]} # Re-labeling/mapping in validation set val_set = dataset_dict['validation'].map(map_decision_to_string) # Filtering only those patents that have decisions as accepted/rejected val_set = val_set.filter(lambda e: e['decision'] <= 1) # Display all patent numbers to select a file patent_num = st.selectbox("Select a patent based on its number", val_set['patent_number']) # Keeping the session state if "button_clicked" not in st.session_state: st.session_state.button_clicked = False # The button was clicked def callback(): st.session_state.button_clicked = True # Get the abstract and claims data to predict if patent_num and (st.button('Get Data to predict!', on_click = callback) or st.session_state.button_clicked): # Display the abstract and claims val_set = val_set.filter(lambda e: e['patent_number'] == patent_num) abstract_text = st.text_area('Abstract', val_set['abstract'][0]) claims_text = st.text_area('Claims', val_set['claims'][0]) # Predict on those texts if abstract_text and claims_text and st.button('Predict!'): # Model/tokenizer name or path to finetuned model model_name_or_path = './models/' model_name = 'distilbert-base-uncased' # Tokenizer tokenizer = AutoTokenizer.from_pretrained(model_name) # Model model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path) # Tokenize the validation dataset and pass it to the model for prediction _SECTION_ = 'claims' val_set = val_set.map(lambda e: tokenizer((e[_SECTION_]), truncation=True, padding='max_length'),batched=True) val_set.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decision']) # Creating a dataloader and only passing one row val_dataloader = DataLoader(val_set, batch_size=16) batch = next(iter(val_dataloader)) inputs = (batch['input_ids'][0]) decisions = (batch['decision'][0]) # Predict with torch.no_grad(): outputs = model(input_ids=inputs, labels=decisions).logits # Display prediction prediction = np.argmax(outputs, axis=-1).stride()[0] # prediction value = {i for i in decision_to_str if decision_to_str[i]==prediction} st.text('This is the predicted decision: ' + str(value)) # Patentability score st.text('Probability that it will be rejected : ' + str(outputs[0][0].item() * 100)) st.text('Probability that it will be accepted : ' + str(outputs[0][1].item() * 100))