dbleek committed on
Commit 4d1c892
2 Parent(s): efd1c85 8114970

Merge pull request #2 from dbleek/milestone-3

Files changed (6)
  1. .gitattributes +34 -0
  2. README.md +6 -4
  3. milestone-2.py +26 -0
  4. milestone-3.py +77 -0
  5. patent_classifier.pt +3 -0
  6. requirements.txt +32 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
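These are the stock Hugging Face `.gitattributes` rules: any file matching one of the patterns is stored through Git LFS instead of as a regular Git blob, which is what keeps the large `patent_classifier.pt` below out of the normal object database. Lines in this format are typically appended by `git lfs track "<pattern>"`. As a purely illustrative sketch (the file names and the pattern subset below are assumptions, not part of the commit), this shows which files would be caught:

```python
# Illustrative only: check hypothetical file names against a subset of the LFS patterns above.
from fnmatch import fnmatch

lfs_patterns = ["*.pt", "*.pth", "*.bin", "*.h5", "*.onnx", "*.zip"]
candidate_files = ["patent_classifier.pt", "milestone-3.py", "requirements.txt"]

for name in candidate_files:
    via_lfs = any(fnmatch(name, pattern) for pattern in lfs_patterns)
    print(name, "-> Git LFS" if via_lfs else "-> regular Git blob")
```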
README.md CHANGED
@@ -1,18 +1,20 @@
  ---
- title: CS-GY-6613 Project Milestone 2
+ title: CS-GY-6613 Project Milestone 3
  colorFrom: blue
  colorTo: red
  sdk: streamlit
- app_file: milestone_2.py
+ app_file: milestone-3.py
  pinned: false
  ---

  # cs-gy-6613-project
  Project for CS-GY-6613 Spring 2023

- # Milestone 2
+ # Milestone 3
+ USPTO Patentability Classifier: https://huggingface.co/spaces/dbleek/cs-gy-6613-project

- Sentiment Analysis App: https://huggingface.co/spaces/dbleek/cs-gy-6613-project
+ # Milestone 2
+ Sentiment Analysis App: https://huggingface.co/spaces/dbleek/cs-gy-6613-project

  # Milestone 1
  For milestone 1, I used the quick start instructions from VS code to connect to a remote Ubuntu container:
milestone-2.py ADDED
@@ -0,0 +1,26 @@
+ import streamlit as st
+ from transformers import (AutoTokenizer, TFAutoModelForSequenceClassification,
+                           pipeline)
+
+ st.title("CS-GY-6613 Project Milestone 2")
+ model_choices = (
+     "distilbert-base-uncased-finetuned-sst-2-english",
+     "j-hartmann/emotion-english-distilroberta-base",
+     "joeddav/distilbert-base-uncased-go-emotions-student",
+ )
+
+ with st.form("Input Form"):
+     text = st.text_area("Write your text here:", "CS-GY-6613 is a great course!")
+     model_name = st.selectbox("Select a model:", model_choices)
+     submitted = st.form_submit_button("Submit")
+
+ if submitted:
+     model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+     res = classifier(text)
+     label = res[0]["label"].upper()
+     score = res[0]["score"]
+     st.markdown(
+         f"This text was classified as **{label}** with a confidence score of **{score}**."
+     )
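`milestone-2.py` wraps a Hugging Face `sentiment-analysis` pipeline in a Streamlit form: the user picks one of three checkpoints, the TF model and tokenizer are loaded on submit, and the top label and score are rendered as Markdown. A minimal sketch of the same classification path outside the UI, assuming the first checkpoint from the select box (the printed label and score are examples, not fixed outputs):

```python
# Minimal sketch: the app's classification step without Streamlit.
from transformers import pipeline

# Passing the checkpoint name lets transformers pick the framework and tokenizer itself;
# the app instead loads TFAutoModelForSequenceClassification and AutoTokenizer explicitly.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

res = classifier("CS-GY-6613 is a great course!")
print(res[0]["label"], round(res[0]["score"], 4))  # e.g. POSITIVE 0.99...
```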
milestone-3.py ADDED
@@ -0,0 +1,77 @@
+ import streamlit as st
+ import torch
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+ from transformers import AutoModelForSequenceClassification
+ from transformers import pipeline
+
+ # Load HUPD dataset
+ dataset_dict = load_dataset(
+     "HUPD/hupd",
+     name="sample",
+     data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
+     icpr_label=None,
+     train_filing_start_date="2016-01-01",
+     train_filing_end_date="2016-01-21",
+     val_filing_start_date="2016-01-22",
+     val_filing_end_date="2016-01-31",
+ )
+
+ # Process data
+ filtered_dataset = dataset_dict["validation"].filter(
+     lambda e: e["decision"] == "ACCEPTED" or e["decision"] == "REJECTED"
+ )
+ dataset = filtered_dataset.shuffle(seed=42).select(range(20))
+ dataset = dataset.sort("patent_number")
+
+ # Create pipeline using model trained on Colab
+ model = torch.load("patent_classifier.pt", map_location=torch.device("cpu"))
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+ classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
+
+
+ def load_patent():
+     selected_application = dataset.select([applications[st.session_state.id]])
+     st.session_state.abstract = selected_application["abstract"][0]
+     st.session_state.claims = selected_application["claims"][0]
+     st.session_state.title = selected_application["title"][0]
+
+
+ st.title("CS-GY-6613 Project Milestone 3")
+
+ # List patent numbers for select box
+ applications = {}
+ for ds_index, example in enumerate(dataset):
+     applications.update({example["patent_number"]: ds_index})
+ st.selectbox(
+     "Select a patent application:", applications, on_change=load_patent, key="id"
+ )
+
+ # Application title displayed for additional context only, not used with model
+ st.text_area("Title", key="title", value=dataset[0]["title"], height=50)
+
+ # Classifier input form
+ with st.form("Input Form"):
+     abstract = st.text_area(
+         "Abstract", key="abstract", value=dataset[0]["abstract"], height=200
+     )
+     claims = st.text_area(
+         "Claims", key="claims", value=dataset[0]["claims"], height=200
+     )
+     submitted = st.form_submit_button("Get Patentability Score")
+
+ if submitted:
+     selected_application = dataset.select([applications[st.session_state.id]])
+     res = classifier(abstract, claims)
+     if res[0]["label"] == "LABEL_0":
+         pred = "ACCEPTED"
+     elif res[0]["label"] == "LABEL_1":
+         pred = "REJECTED"
+     score = res[0]["score"]
+     label = selected_application["decision"][0]
+     result = st.markdown(
+         "This text was classified as **{}** with a confidence score of **{}**.".format(
+             pred, score
+         )
+     )
+     check = st.markdown("Actual Label: **{}**.".format(label))
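`milestone-3.py` expects `patent_classifier.pt` to deserialize straight into a sequence-classification model, which is what `torch.load` returns when the whole module (not just a `state_dict`) was saved. The training notebook is not part of this commit, so the sketch below is only a guess at how such a checkpoint could have been produced on Colab; the base checkpoint, the omitted fine-tuning step, and the two-label head are assumptions chosen to match the `distilbert-base-uncased` tokenizer and the LABEL_0/LABEL_1 mapping used in the app. Note also that `classifier(abstract, claims)` passes `claims` as an extra positional argument, which the text-classification pipeline most likely ignores, so the displayed score reflects the abstract.

```python
# Hypothetical sketch (not from this commit): saving a checkpoint that
# torch.load("patent_classifier.pt") can restore as a full model for the pipeline.
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,  # LABEL_0 -> ACCEPTED, LABEL_1 -> REJECTED in milestone-3.py
)

# ... fine-tune on HUPD abstracts/claims here (omitted) ...

# Saving the entire module (rather than model.state_dict()) is what lets the app
# get a ready-to-use model back from torch.load() on CPU.
torch.save(model, "patent_classifier.pt")
```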
patent_classifier.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8fbbdc470f673703431aa31cc7451af0d0608df3bd6e7006ab32866803f4eece
+ size 267882633
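The diff for `patent_classifier.pt` shows its Git LFS pointer rather than the model itself: the pointer records the LFS spec version, the SHA-256 of the real object, and its size (roughly 268 MB). A small sketch, assuming the checkpoint has already been fetched locally (e.g. via `git lfs pull`), that verifies the downloaded file against the pointer:

```python
# Minimal sketch: verify a locally fetched LFS object against the pointer's oid and size.
import hashlib
from pathlib import Path

EXPECTED_OID = "8fbbdc470f673703431aa31cc7451af0d0608df3bd6e7006ab32866803f4eece"
EXPECTED_SIZE = 267882633  # bytes, from the pointer file

path = Path("patent_classifier.pt")
sha256 = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)

print("size matches:", path.stat().st_size == EXPECTED_SIZE)
print("sha256 matches:", sha256.hexdigest() == EXPECTED_OID)
```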
requirements.txt CHANGED
@@ -1,16 +1,24 @@
  absl-py==1.4.0
+ aiohttp==3.8.4
+ aiosignal==1.3.1
  altair==4.2.2
  astunparse==1.6.3
+ async-timeout==4.0.2
  attrs==22.2.0
  blinker==1.6.1
  cachetools==5.3.0
  certifi==2022.12.7
  charset-normalizer==3.1.0
  click==8.1.3
+ cmake==3.26.3
+ datasets==2.12.0
  decorator==5.1.1
+ dill==0.3.6
  entrypoints==0.4
  filelock==3.10.7
  flatbuffers==23.3.3
+ frozenlist==1.3.3
+ fsspec==2023.4.0
  gast==0.4.0
  gitdb==4.0.10
  GitPython==3.1.31
@@ -28,12 +36,28 @@ Jinja2==3.1.2
  jsonschema==4.17.3
  keras==2.12.0
  libclang==16.0.0
+ lit==16.0.2
  Markdown==3.4.3
  markdown-it-py==2.2.0
  MarkupSafe==2.1.2
  mdurl==0.1.2
  ml-dtypes==0.0.4
+ mpmath==1.3.0
+ multidict==6.0.4
+ multiprocess==0.70.14
+ networkx==3.1
  numpy==1.23.5
+ nvidia-cublas-cu11==11.10.3.66
+ nvidia-cuda-cupti-cu11==11.7.101
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ nvidia-cuda-runtime-cu11==11.7.99
+ nvidia-cudnn-cu11==8.5.0.96
+ nvidia-cufft-cu11==10.9.0.58
+ nvidia-curand-cu11==10.2.10.91
+ nvidia-cusolver-cu11==11.4.0.1
+ nvidia-cusparse-cu11==11.7.4.91
+ nvidia-nccl-cu11==2.14.3
+ nvidia-nvtx-cu11==11.7.91
  oauthlib==3.2.2
  opt-einsum==3.3.0
  packaging==23.0
@@ -56,12 +80,14 @@ PyYAML==6.0
  regex==2023.3.23
  requests==2.28.2
  requests-oauthlib==1.3.1
+ responses==0.18.0
  rich==13.3.3
  rsa==4.9
  scipy==1.10.1
  six==1.16.0
  smmap==5.0.0
  streamlit==1.21.0
+ sympy==1.11.1
  tensorboard==2.12.1
  tensorboard-data-server==0.7.0
  tensorboard-plugin-wit==1.8.1
@@ -72,9 +98,13 @@ termcolor==2.2.0
  tokenizers==0.13.3
  toml==0.10.2
  toolz==0.12.0
+ torch==2.0.0
+ torchaudio==2.0.1
+ torchvision==0.15.1
  tornado==6.2
  tqdm==4.65.0
  transformers==4.27.4
+ triton==2.0.0
  typing_extensions==4.5.0
  tzdata==2023.3
  tzlocal==4.3
@@ -84,4 +114,6 @@ watchdog==3.0.0
  wcwidth==0.2.6
  Werkzeug==2.2.3
  wrapt==1.14.1
+ xxhash==3.2.0
+ yarl==1.9.2
  zipp==3.15.0