File size: 2,345 Bytes
f42a46b
23db584
f42a46b
 
 
9cddcfa
23db584
f42a46b
9cddcfa
23db584
f42a46b
9cddcfa
 
f42a46b
c980f44
baa3703
 
f42a46b
 
 
 
 
 
 
 
922fe06
 
 
f42a46b
 
 
 
 
477ef11
f42a46b
 
 
 
 
 
 
 
 
de1fe39
 
f42a46b
 
 
 
 
8ff154b
f42a46b
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Import necessary libraries
import streamlit as st
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

# title
st.title("Patentability Score - Predict patent acceptance from a given application")

# subtitle
st.markdown("## Patentability Score - Finetuned on The Harvard USPTO Patent Dataset -  hosted on 🤗 Spaces")
st.markdown("")


@st.cache_resource
def _load_model_and_tokenizer():
    """Load the fine-tuned classifier and tokenizer once per process.

    Without caching, Streamlit re-downloads and re-instantiates the model on
    every widget interaction (each rerun executes the whole script).
    """
    model_path = "umangsoni/distilbert-base-tuned"  # Replace with your model path
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    return tokenizer, model


@st.cache_data
def _load_patent_frame():
    """Load the HUPD sample split and return a DataFrame indexed by patent number."""
    # NOTE(review): '/resolve/' (not '/blob/') is needed to fetch the raw
    # feather file — the '/blob/' URL returns an HTML viewer page instead of
    # the data. Confirm against the HUPD dataset card.
    patent_data = load_dataset(
        'HUPD/hupd',
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
    )
    train = patent_data['train']
    return pd.DataFrame({
        'patent_number': train['patent_number'],
        'abstract': train['abstract'],
        'claims': train['claims'],
        'decision': train['decision'],
    }).set_index('patent_number')  # Patent number is the lookup key for the UI


tokenizer, model = _load_model_and_tokenizer()
df = _load_patent_frame()

patent_ids = df.index.unique().tolist()  # Get unique patent IDs

# Create a dropdown menu for patent IDs
patent_id = st.selectbox("Select Patent Application ID", patent_ids)

# Fetch abstract and claims for the selected patent. When the index contains
# duplicate patent numbers, df.loc returns a DataFrame rather than a Series,
# so collapse to the first matching row in that case.
record = df.loc[patent_id]
if isinstance(record, pd.DataFrame):
    record = record.iloc[0]
abstract = record['abstract'] or ""  # Guard against missing (None) text fields
claims = record['claims'] or ""

st.text_area("Abstract:", value=abstract, height=200, max_chars=None, key=None)
st.text_area("Claims:", value=claims, height=200, max_chars=None, key=None)

if st.button("Submit"):
    # Preprocess input: concatenate abstract and claims, truncated to the
    # model's 512-token limit.
    inputs = tokenizer(abstract + " " + claims, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Run model (forward pass only)
    outputs = model(**inputs)

    # Convert logits to class probabilities. Index 1 is assumed to be the
    # "ACCEPTED" class — NOTE(review): confirm against the model's id2label.
    probs = outputs.logits.softmax(dim=1).detach().numpy()[0]
    score = probs[1]  # Probability of being "ACCEPTED"

    # Display score
    st.write(f"Patentability Score: {score}")