dbleek committed on
Commit
24c49f4
1 Parent(s): d86b016

removed old classifier

Browse files
milestone-3.py CHANGED
@@ -1,67 +1,77 @@
1
  import streamlit as st
2
  import torch
3
  from datasets import load_dataset
4
- from transformers import AutoTokenizer
5
  from transformers import AutoModelForSequenceClassification
6
  from transformers import pipeline
7
 
8
  # Load HUPD dataset
9
- dataset_dict = load_dataset('HUPD/hupd',
10
- name='sample',
11
- data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
 
12
  icpr_label=None,
13
- train_filing_start_date='2016-01-01',
14
- train_filing_end_date='2016-01-21',
15
- val_filing_start_date='2016-01-22',
16
- val_filing_end_date='2016-01-31',
17
  )
18
 
19
  # Process data
20
- filtered_dataset = dataset_dict['validation'].filter(lambda e: e['decision'] == 'ACCEPTED' or e['decision'] == 'REJECTED')
 
 
21
  dataset = filtered_dataset.shuffle(seed=42).select(range(20))
22
  dataset = dataset.sort("patent_number")
23
 
24
  # Create pipeline using model trained on Colab
25
- model = torch.load("patent_classifier.pt", map_location=torch.device('cpu'))
26
  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
27
  classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
28
 
 
29
  def load_patent():
30
  selected_application = dataset.select([applications[st.session_state.id]])
31
- st.session_state.abstract = selected_application['abstract'][0]
32
- st.session_state.claims = selected_application['claims'][0]
33
- st.session_state.title = selected_application['title'][0]
 
34
 
35
  st.title("CS-GY-6613 Project Milestone 3")
36
 
37
  # List patent numbers for select box
38
  applications = {}
39
  for ds_index, example in enumerate(dataset):
40
- applications.update({example['patent_number']: ds_index })
41
- st.selectbox("Select a patent application:", applications, on_change=load_patent, key="id")
 
 
42
 
43
  # Application title displayed for additional context only, not used with model
44
- st.text_area("Title", key="title", value=dataset[0]['title'], height=50)
45
 
46
  # Classifier input form
47
- with st.form('Input Form'):
48
- abstract = st.text_area("Abstract", key="abstract", value=dataset[0]['abstract'], height=200)
49
- claims = st.text_area("Claims", key="claims", value=dataset[0]['abstract'], height=200)
 
 
 
 
50
  submitted = st.form_submit_button("Get Patentability Score")
51
 
52
  if submitted:
53
  selected_application = dataset.select([applications[st.session_state.id]])
54
  res = classifier(abstract, claims)
55
- if res[0]["label"] == 'LABEL_0':
56
  pred = "ACCEPTED"
57
- elif res[0]["label"] == 'LABEL_1':
58
  pred = "REJECTED"
59
  score = res[0]["score"]
60
- label = selected_application['decision'][0]
61
- result = st.markdown("This text was classified as **{}** with a confidence score of **{}**.".format(pred, score))
 
 
 
 
62
  check = st.markdown("Actual Label: **{}**.".format(label))
63
-
64
-
65
-
66
-
67
-
 
"""Streamlit demo: score the patentability of HUPD patent applications.

Loads a small slice of the HUPD sample dataset, lets the user pick one of 20
applications, and runs its abstract/claims through a fine-tuned DistilBERT
text-classification pipeline.
"""
import streamlit as st
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline

# Maps the pipeline's generic class ids to the HUPD decision names.
LABEL_NAMES = {"LABEL_0": "ACCEPTED", "LABEL_1": "REJECTED"}

# Load HUPD dataset
dataset_dict = load_dataset(
    "HUPD/hupd",
    name="sample",
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date="2016-01-01",
    train_filing_end_date="2016-01-21",
    val_filing_start_date="2016-01-22",
    val_filing_end_date="2016-01-31",
)

# Process data: keep only applications with a final decision, then take a
# fixed (seeded) sample of 20 and order it by patent number for the select box.
filtered_dataset = dataset_dict["validation"].filter(
    lambda e: e["decision"] in ("ACCEPTED", "REJECTED")
)
dataset = filtered_dataset.shuffle(seed=42).select(range(20))
dataset = dataset.sort("patent_number")

# Create pipeline using model trained on Colab
# NOTE(review): torch.load unpickles arbitrary objects; only load checkpoint
# files that come from a trusted source.
model = torch.load("patent_classifier.pt", map_location=torch.device("cpu"))
# A model pickled straight after training may still be in training mode;
# switch to eval so dropout does not perturb the reported scores.
model.eval()
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)


def load_patent():
    """Selectbox callback: copy the chosen application's text into session state.

    Reads the selected patent number from ``st.session_state.id`` and updates
    the ``abstract``, ``claims`` and ``title`` widget values to match it.
    """
    application = dataset[applications[st.session_state.id]]
    st.session_state.abstract = application["abstract"]
    st.session_state.claims = application["claims"]
    st.session_state.title = application["title"]


st.title("CS-GY-6613 Project Milestone 3")

# List patent numbers for select box (patent number -> row index in `dataset`)
applications = {
    example["patent_number"]: ds_index for ds_index, example in enumerate(dataset)
}
st.selectbox(
    "Select a patent application:",
    list(applications),
    on_change=load_patent,
    key="id",
)

# Application title displayed for additional context only, not used with model
st.text_area("Title", key="title", value=dataset[0]["title"], height=50)

# Classifier input form
with st.form("Input Form"):
    abstract = st.text_area(
        "Abstract", key="abstract", value=dataset[0]["abstract"], height=200
    )
    # Seed the claims box with the claims text (it previously reused the abstract).
    claims = st.text_area(
        "Claims", key="claims", value=dataset[0]["claims"], height=200
    )
    submitted = st.form_submit_button("Get Patentability Score")

if submitted:
    # NOTE(review): confirm the pipeline actually consumes the second
    # positional argument; text pairs are normally passed as
    # {"text": ..., "text_pair": ...}.
    res = classifier(abstract, claims)
    raw_label = res[0]["label"]
    # Fall back to the raw label so an unexpected class id cannot leave
    # `pred` undefined.
    pred = LABEL_NAMES.get(raw_label, raw_label)
    score = res[0]["score"]
    # Ground-truth decision of the application currently selected in the box.
    label = dataset[applications[st.session_state.id]]["decision"]
    st.markdown(
        f"This text was classified as **{pred}** with a confidence score of **{score}**."
    )
    st.markdown(f"Actual Label: **{label}**.")
 
 
 
 
 
patent_classifier_v2.pt → patent_classifier.pt RENAMED
File without changes