"""Streamlit app: pick a patent application and score its patentability.

The app loads a fine-tuned sequence-classification model and a sample of the
Harvard USPTO Patent Dataset (HUPD), lets the user choose an entry by its
``patent_number``, shows that entry's abstract and claims, and on "Submit"
reports the model's softmax probability for class index 1.
"""

# Import necessary libraries
import pandas as pd
import streamlit as st
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# title
st.title("Patentability Score - Extract sentiment from a given text")

# subtitle
st.markdown("## Patentability Score - Finetuned on The Harvard USPTO Patent Dataset - hosted on 🤗 Spaces")
st.markdown("")

model_path = "umangsoni/distilbert-base-tuned"  # Replace with your model path


# Streamlit re-executes this script on every widget interaction, so every
# expensive load below is cached and performed only once per server process.
@st.cache_resource
def _load_tokenizer_and_model():
    """Return the ``(tokenizer, model)`` pair used for scoring."""
    tok = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    clf = AutoModelForSequenceClassification.from_pretrained(model_path)
    return tok, clf


@st.cache_resource
def _load_patent_dataset():
    """Download (or reuse) the HUPD sample split configuration used by the app."""
    return load_dataset(
        'HUPD/hupd',
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
    )


@st.cache_data
def _load_patent_frame():
    """Return the train split as a DataFrame indexed by ``patent_number``."""
    train = _load_patent_dataset()['train']
    return pd.DataFrame({
        'patent_number': train['patent_number'],
        'abstract': train['abstract'],
        'claims': train['claims'],
        'decision': train['decision'],
    }).set_index('patent_number')


def _as_text(value):
    """Coerce a possibly-missing cell to ``str`` (missing becomes ``""``)."""
    return "" if pd.isna(value) else str(value)


# Load trained model and tokenizer
tokenizer, model = _load_tokenizer_and_model()

# Load USPTO dataset and extract unique patent IDs
patent_data = _load_patent_dataset()
df = _load_patent_frame()  # Dataframe with patent number as index
patent_ids = df.index.unique().tolist()  # Get unique patent IDs

# Create a dropdown menu for patent IDs
patent_id = st.selectbox("Select Patent Application ID", patent_ids)
if patent_id is None:
    # An empty option list makes the selectbox return None; nothing to show.
    st.warning("No patent applications are available.")
    st.stop()

# Fetch and display abstract and claims for selected patent.
# Selecting with a *list* label always yields a DataFrame, even when the label
# is unique. With a scalar label, a duplicated index value would return a
# DataFrame too, and tuple-unpacking a DataFrame yields its column labels
# ("abstract", "claims") instead of the cell values. Taking the first matching
# row keeps the lookup well-defined either way.
# NOTE(review): presumably several rows can share one patent_number (e.g. a
# placeholder for applications that never issued) -- confirm against the data.
matches = df.loc[[patent_id], ['abstract', 'claims']]
abstract, claims = (_as_text(cell) for cell in matches.iloc[0])

st.text_area("Abstract:", value=abstract, height=200, max_chars=None, key=None)
st.text_area("Claims:", value=claims, height=200, max_chars=None, key=None)

if st.button("Submit"):
    # Preprocess input; anything beyond 512 tokens is truncated.
    inputs = tokenizer(
        abstract + " " + claims,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Run model
    outputs = model(**inputs)

    # Process outputs: softmax over the class dimension, first (only) example.
    probs = outputs.logits.softmax(dim=1).detach().numpy()[0]
    # Index 1 is treated as the "ACCEPTED" class.
    # NOTE(review): confirm against the model's label mapping.
    score = probs[1]

    # Display score
    st.write(f"Patentability Score: {score}")