# Import necessary libraries
import streamlit as st
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
# title
st.title("Patentability Score - Predict patent acceptance from a given application")
# subtitle
st.markdown("## Patentability Score - Finetuned on The Harvard USPTO Patent Dataset - hosted on 🤗 Spaces")
st.markdown("")
# Load trained model and tokenizer
model_path = "umangsoni/distilbert-base-tuned"  # Replace with your model path
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained(model_path)
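# Note: the stock 'distilbert-base-uncased' tokenizer is used here; this assumes the fine-tuned
# checkpoint kept the base vocabulary. If the tokenizer was saved alongside the model, load it
# from model_path instead.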
# Load USPTO dataset and extract unique patent IDs
patent_data = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
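# The 'sample' configuration covers January 2016 filings only: train = Jan 1-21, validation = Jan 22-31.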
df = pd.DataFrame({
    'patent_number': patent_data['train']['patent_number'],
    'abstract': patent_data['train']['abstract'],
    'claims': patent_data['train']['claims'],
    'decision': patent_data['train']['decision'],
}).set_index('patent_number')  # Create a dataframe with patent number as index
patent_ids = df.index.unique().tolist()  # Get unique patent IDs
# Create a dropdown menu for patent IDs
patent_id = st.selectbox("Select Patent Application ID", patent_ids)
# Fetch and display abstract and claims for the selected patent
abstract, claims = df.loc[patent_id, ['abstract', 'claims']]
st.text_area("Abstract:", value=abstract, height=200)
st.text_area("Claims:", value=claims, height=200)
if st.button("Submit"): | |
# Preprocess input | |
inputs = tokenizer(abstract + " " + claims, return_tensors="pt", padding=True, truncation=True, max_length=512) | |
# Run model | |
outputs = model(**inputs) | |
# Process outputs | |
probs = outputs.logits.softmax(dim=1).detach().numpy()[0] | |
score = probs[1] # Probability of being "ACCEPTED" | |
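    # Assumption: the fine-tuned head maps label index 1 to the "ACCEPTED" class;
    # adjust the index if the model's id2label mapping differs.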
    # Display score
    st.write(f"Patentability Score: {score}")