umangsoni committed
Commit f42a46b · 1 Parent(s): 95eb783

add finetune.py

Files changed (2)
  1. app.py +50 -26
  2. finetune.py +25 -7
app.py CHANGED
@@ -1,33 +1,58 @@
+# Import necessary libraries
 import streamlit as st
-from transformers import pipeline
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import pandas as pd
 
 # title
-st.title("Sentiment Analysis - Extract sentiment from a given text")
+st.title("Patentability Score - Estimate the patentability of a patent application")
 
 # subtitle
-st.markdown("## Sentiment Analysis App - Using `transformers`, `streamlit` - hosted on 🤗 Spaces")
+st.markdown("## Patentability Score - Finetuned on The Harvard USPTO Patent Dataset - hosted on 🤗 Spaces")
 st.markdown("")
 
-# text input
-user_input = st.text_area("Please enter a sentence for sentiment analysis", "I am so happy and excited!")
-
-
-
-@st.cache_resource
-def get_sentiment_model():
-    return pipeline('sentiment-analysis')
-
-
-sentiment_model = get_sentiment_model()  # load model
-
-if user_input is not None:
-    with st.spinner("🤖 AI is at Work! "):
-        result = sentiment_model(user_input)
-    st.write(result)
-    # st.success("Here you go!")
-    st.balloons()
-else:
-    st.write("Enter a sentence")
-
-st.caption("Made by @us87")
-
+# Load trained model and tokenizer
+model_path = "./results"  # Replace with your model path
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForSequenceClassification.from_pretrained(model_path)
+
+# Load USPTO dataset and extract unique patent IDs
+patent_data = load_dataset(
+    'HUPD/hupd',
+    name='sample',
+    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
+    ipcr_label=None,
+    train_filing_start_date='2016-01-01',
+    train_filing_end_date='2016-01-31',
+)
+
+df = pd.DataFrame({
+    'patent_number': patent_data['train']['patent_number'],
+    'abstract': patent_data['train']['abstract'],
+    'claims': patent_data['train']['claims'],
+    'decision': patent_data['train']['decision'],
+}).set_index('patent_number')  # Create a dataframe with patent number as index
+
+patent_ids = df.index.unique().tolist()  # Get unique patent IDs
+
+# Create a dropdown menu for patent IDs
+patent_id = st.selectbox("Select Patent Application ID", patent_ids)
+
+# Fetch and display abstract and claims for selected patent
+abstract, claims = df.loc[patent_id, ['abstract', 'claims']]
+st.text_area("Abstract:", value=abstract, height=200, max_chars=None, key=None)
+st.text_area("Claims:", value=claims, height=200, max_chars=None, key=None)
+
+if st.button("Submit"):
+    # Preprocess input
+    inputs = tokenizer(abstract + " " + claims, return_tensors="pt")
+
+    # Run model
+    outputs = model(**inputs)
+
+    # Process outputs
+    probs = outputs.logits.softmax(dim=1).detach().numpy()[0]
+    score = probs[1]  # Probability of being "ACCEPTED"
+
+    # Display score
+    st.write(f"Patentability Score: {score}")
finetune.py CHANGED
@@ -2,6 +2,9 @@ from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassific
 from datasets import load_dataset
 import torch
 from torch.utils.data import Dataset
+from tqdm import tqdm
+import pandas as pd
+
 
 # Check if we have an MPS-compatible device
 mps_device = torch.device("mps" if torch.backends.mps.is_built() else "cpu")
@@ -24,16 +27,31 @@ dataset_dict = load_dataset(
     val_filing_end_date='2016-01-31',
 )
 
-label_values = ['ACCEPTED', 'REJECTED', 'PENDING']
+label_values = ['REJECTED', 'ACCEPTED']
 
-train_texts = dataset_dict['train']['abstract']
-train_labels = [label_values.index(decision)
-                for decision in dataset_dict['train']['decision'] if decision in label_values]
+df = pd.DataFrame({
+    'abstract': dataset_dict['train']['abstract'],
+    'decision': dataset_dict['train']['decision']
+})
+
+# Filter out abstracts where decision is not in label_values
+df = df[df['decision'].isin(label_values)]
+df['decision'] = df['decision'].apply(lambda x: 1 if x == 'ACCEPTED' else 0)
+train_texts, train_labels = df['abstract'].tolist(), df['decision'].tolist()
 
-validation_texts = dataset_dict['validation']['abstract']
-validation_labels = [label_values.index(decision)
-                     for decision in dataset_dict['validation']['decision'] if decision in label_values]
+# Do the same for the validation dataset
+df = pd.DataFrame({
+    'abstract': dataset_dict['validation']['abstract'],
+    'decision': dataset_dict['validation']['decision']
+})
 
+# Filter out abstracts where decision is not in label_values
+df = df[df['decision'].isin(label_values)]
+df['decision'] = df['decision'].apply(lambda x: 1 if x == 'ACCEPTED' else 0)
+validation_texts, validation_labels = df['abstract'].tolist(), df['decision'].tolist()
 
+print("Number of training samples: {:,}\nNumber of validation samples: {:,}".format(len(train_texts), len(validation_texts)))
 
 # Create a PyTorch Dataset
 class USPTODataset(Dataset):
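
The diff ends at the `USPTODataset` declaration, so the class body is not visible in this commit view. A typical implementation consistent with the `train_texts`/`train_labels` lists and the `DistilBertTokenizerFast` import above might look like the following sketch (an assumption, not the committed code):

```python
# Sketch of a Dataset matching the texts/labels built above (assumed, not committed).
import torch
from torch.utils.data import Dataset

class USPTODataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        # Tokenize once up front, truncating to the model's 512-token limit
        self.encodings = tokenizer(texts, truncation=True, padding=True)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Return tensors in the format transformers models expect
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
```

Instantiated as, e.g., `USPTODataset(train_texts, train_labels, tokenizer)`, this can be fed directly to a `torch.utils.data.DataLoader` for training.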