Spaces:
Sleeping
Sleeping
File size: 2,345 Bytes
f42a46b 23db584 f42a46b 9cddcfa 23db584 f42a46b 9cddcfa 23db584 f42a46b 9cddcfa f42a46b c980f44 baa3703 f42a46b 922fe06 f42a46b 477ef11 f42a46b de1fe39 f42a46b 8ff154b f42a46b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# Import necessary libraries
import pandas as pd
import streamlit as st
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# ---- Page header ----
st.title("Patentability Score - Predict patentability from patent text")
# subtitle
st.markdown("## Patentability Score - Finetuned on The Harvard USPTO Patent Dataset - hosted on 🤗 Spaces")
st.markdown("")

# ---- Model loading ----
model_path = "umangsoni/distilbert-base-tuned"  # Fine-tuned classifier on the HF Hub


@st.cache_resource  # load once and reuse across Streamlit reruns instead of reloading per interaction
def _load_model_and_tokenizer(path):
    """Return (tokenizer, model) for the fine-tuned patentability classifier.

    The tokenizer comes from the base checkpoint the model was tuned from
    ('distilbert-base-uncased'); the classification head comes from *path*.
    """
    tok = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    mdl = AutoModelForSequenceClassification.from_pretrained(path)
    return tok, mdl


tokenizer, model = _load_model_and_tokenizer(model_path)
# ---- Data loading ----
@st.cache_data  # download and parse the HUPD sample once; reruns reuse the cached frame
def _load_patent_frame():
    """Load the HUPD sample split and return a DataFrame indexed by patent number.

    Only the January-2016 filing window configured below is used; columns are
    abstract, claims, and the examiner's decision.
    """
    dataset = load_dataset(
        'HUPD/hupd',
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
    )
    train = dataset['train']
    return pd.DataFrame({
        'patent_number': train['patent_number'],
        'abstract': train['abstract'],
        'claims': train['claims'],
        'decision': train['decision'],
    }).set_index('patent_number')


df = _load_patent_frame()
patent_ids = df.index.unique().tolist()  # unique patent numbers drive the dropdown
# ---- Application selector ----
patent_id = st.selectbox("Select Patent Application ID", patent_ids)

# Fetch abstract and claims for the selected patent. If the index ever holds
# duplicate patent numbers, .loc returns a DataFrame rather than a Series,
# so normalize to the first matching row before unpacking.
selected = df.loc[patent_id]
if isinstance(selected, pd.DataFrame):
    selected = selected.iloc[0]
abstract, claims = selected['abstract'], selected['claims']

st.text_area("Abstract:", value=abstract, height=200)
st.text_area("Claims:", value=claims, height=200)
if st.button("Submit"):
    # Tokenize abstract + claims as one sequence, truncated to the model's
    # 512-token limit.
    inputs = tokenizer(abstract + " " + claims, return_tensors="pt",
                       padding=True, truncation=True, max_length=512)
    # Inference only: disable autograd to avoid building a gradient graph.
    with torch.no_grad():
        outputs = model(**inputs)
    # Softmax over the class logits; index 1 is assumed to be "ACCEPTED"
    # (matches the label order used at fine-tuning time — verify against the
    # model's id2label config).
    probs = outputs.logits.softmax(dim=1).numpy()[0]
    score = probs[1]
    # Display score
    st.write(f"Patentability Score: {score}")
|