bokey committed
Commit 12033a4
1 Parent(s): 20ee7e8

hf SPACES TEST

Files changed (3):
  1. app.py +343 -0
  2. requirements.txt +11 -0
  3. run.py +39 -0
app.py ADDED
@@ -0,0 +1,343 @@
+ '''
+ Creator: Sudhir Arvind Deshmukh
+ Run command: streamlit run app.py
+ '''
+ import streamlit as st
+ import spacy
+ from spacy.tokens import Doc
+ from spacy.training.example import Example
+ import datetime
+ import os
+ import random
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ import matplotlib.pyplot as plt
+ from transformers import AutoTokenizer, T5ForConditionalGeneration
+ from spacy import displacy
+ 
+ ## Load spaCy models from the saved_models directory
+ 
+ # Get the absolute path to the current script's directory
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ saved_models_dir = os.path.join(script_dir, "saved_models")
+ nlp_models = ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"] + [os.path.join(saved_models_dir, str(model_name)) for model_name in os.listdir(saved_models_dir)]
+ 
+ # Function to load the CSV file and extract sentences and tags
+ def load_data_from_csv(file):
+     df = pd.read_csv(file, encoding="latin-1")
+     # Forward-fill "Sentence #" before dropping NaNs, so continuation
+     # rows keep their sentence id instead of being discarded
+     df.loc[:, "Sentence #"] = df["Sentence #"].ffill()
+     df = df.dropna()
+     sentences = df.groupby("Sentence #")["Word"].apply(list).values
+     tags = df.groupby("Sentence #")["Tag"].apply(list).values
+     return sentences, tags
+ 
+ 
+ 
+ # Streamlit UI for online inference
+ def online_inference():
+     st.title("Online Inference")
+ 
+     selected_model = st.selectbox("Select a model for inference", nlp_models)
+ 
+     # Load the selected spaCy model
+     nlp = spacy.load(selected_model)
+ 
+     text_input = st.text_input("Enter Text for Inference")
+ 
+     if text_input:
+         doc = nlp(text_input)
+ 
+         # Filter out 'O' entities and get unique entity types
+         filtered_entities = [ent for ent in doc.ents if ent.label_ != 'O']
+         unique_entity_types = list(set(ent.label_ for ent in filtered_entities))
+ 
+         if filtered_entities:
+             # Define Google-themed colors for each entity type
+             color_dict = {
+                 'B-geo': '#4285F4',  # Blue
+                 'B-gpe': '#EA4335',  # Red
+                 'B-per': '#FBBC05',  # Yellow
+                 'I-geo': '#0F9D58',  # Green
+                 'B-org': '#34A853',  # Green
+                 'I-org': '#FF9800',  # Orange
+                 'B-tim': '#AA66CC',  # Purple
+                 'B-art': '#FFC107',  # Amber
+                 'I-art': '#9C27B0',  # Purple
+                 'I-per': '#03A9F4',  # Blue
+                 'I-gpe': '#009688',  # Teal
+                 'I-tim': '#FF5722',  # Deep Orange
+                 'B-nat': '#7B1FA2',  # Deep Purple
+                 'B-eve': '#8BC34A',  # Light Green
+                 'I-eve': '#FDD835',  # Yellow
+                 'I-nat': '#616161'   # Gray
+             }
+ 
+             # Render the visualization with custom colors
+             options = {"ents": unique_entity_types, "colors": color_dict}
+             html = displacy.render(doc, style="ent", options=options)
+             st.components.v1.html(html, height=400)
+         else:
+             st.write("No named entities found in the text.")
+ 
+ # Streamlit UI for model training
+ def model_training():
+     st.title("Model Training")
+ 
+     base_model = ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"]
+     selected_model = st.selectbox("Select base model to train", base_model)
+ 
+     # Define hyperparameters
+     # (learning rate is collected in the UI but not currently passed to nlp.update)
+     learning_rate = st.slider("Learning Rate", min_value=0.001, max_value=0.1, step=0.001, value=0.01)
+     n_iter = st.slider("Number of Iterations", min_value=1, max_value=10, value=2)
+     dropout = st.slider("Dropout", min_value=0.1, max_value=0.9, step=0.1, value=0.5)
+ 
+     uploaded_file = st.file_uploader("Upload Training Data (CSV)", type="csv")
+ 
+     model_name_uniq = st.text_input("Enter Model Name")
+     if st.button("Train & Evaluate Model"):
+         if uploaded_file is not None:
+             # Load training data from the uploaded CSV file
+             sentences, tags = load_data_from_csv(uploaded_file)
+ 
+             # Split data into training, validation, and test sets
+             train_sentences, test_sentences, train_tags, test_tags = train_test_split(sentences, tags, test_size=0.2, random_state=42)
+             train_sentences, val_sentences, train_tags, val_tags = train_test_split(train_sentences, train_tags, test_size=0.2, random_state=42)
+ 
+             print(f"Experimenting with model: {selected_model}")
+ 
+             # Load the pre-trained model
+             nlp = spacy.load(selected_model)
+ 
+             # Add or reuse the NER component in the pipeline
+             if "ner" not in nlp.pipe_names:
+                 ner = nlp.add_pipe("ner")
+             else:
+                 ner = nlp.get_pipe("ner")
+ 
+             # Convert (words, tags) pairs into spaCy Example objects;
+             # character offsets assume one space after every token
+             def convert_to_spacy_format(sentences, tags):
+                 examples = []
+                 for sent, tag_list in zip(sentences, tags):
+                     words = sent
+                     spaces = [True] * len(words)
+                     doc = Doc(nlp.vocab, words=words, spaces=spaces)
+                     gold_entities = []
+                     for token, tag in zip(doc, tag_list):
+                         start = token.idx
+                         end = start + len(token.text)
+                         gold_entities.append((start, end, tag))
+                     example = Example.from_dict(doc, {"entities": gold_entities})
+                     examples.append(example)
+                 return examples
+ 
+             # Add entity labels to the NER component
+             for label in set(tag for tag_list in tags for tag in tag_list):
+                 ner.add_label(label)
+ 
+             # Create spaCy examples for training
+             train_examples = convert_to_spacy_format(train_sentences, train_tags)
+             val_examples = convert_to_spacy_format(val_sentences, val_tags)
+ 
+             # Lists to store learning-curve data
+             train_losses = []
+             train_api_metrics = []
+             val_precisions = []
+             val_recalls = []
+ 
+             total_batches = len(train_examples) / 8
+             ner_metrics = []
+             # Train the NER model
+             for epoch in range(n_iter):
+                 random.shuffle(train_examples)
+                 st.write("Iteration number:", epoch)
+                 losses = {}
+                 progress_bar = st.progress(0)
+                 for batch_index, batch in enumerate(spacy.util.minibatch(train_examples, size=8), start=1):
+                     nlp.update(batch, drop=dropout, losses=losses)
+                     # Calculate and display training progress
+                     progress_percentage = batch_index / (total_batches + 1)
+                     progress_bar.progress(progress_percentage)
+                 train_losses.append(losses["ner"])
+                 train_api_metrics.append(losses)
+ 
+                 # Evaluate the model on the validation set
+                 metrics = nlp.evaluate(val_examples)
+                 val_precisions.append(metrics["ents_p"])
+                 val_recalls.append(metrics["ents_r"])
+ 
+                 # Append metrics to the ner_metrics list
+                 ner_metrics.append(metrics)
+             print(val_precisions)
+             print(val_recalls)
+             current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+             save_model_name = f"{model_name_uniq}_ner_model_{current_time}"
+             # Plot the learning curve
+             plt.figure(figsize=(12, 4))
+             plt.plot(range(n_iter), train_losses, label="Training Loss")
+             plt.xlabel("Epoch")
+             plt.ylabel("Loss")
+             plt.title(f"Learning Curve for Model: {save_model_name}")
+             plt.legend()
+             learning_curve_plot_path = f"images/learning_curve_{save_model_name}.png"
+             plt.savefig(learning_curve_plot_path)
+             st.image(learning_curve_plot_path)
+ 
+             # A precision-recall curve is not straightforward to derive from
+             # spaCy's evaluate API, so it is left for a BERT implementation
+ 
+             # Save the trained model to disk with a timestamp
+             nlp.to_disk(os.path.join(saved_models_dir, str(save_model_name)))
+             st.success(f"Trained model saved as: {save_model_name}")
+ 
+             # Important NER performance metrics to report
+             ner_performance_metrics = ["ents_p", "ents_r", "ents_f"]
+ 
+             st.write("---")
+             st.subheader("Evaluation metrics on validation data (last epoch)")
+             st.write(f"Model: {selected_model}")
+             for metric_name in ner_performance_metrics:
+                 metric_value = ner_metrics[-1].get(metric_name, 0.0)
+                 st.write(f"{metric_name}: {metric_value}")
+ 
+             st.write("---")
+             st.subheader("Performance metrics on test data")
+ 
+             # Evaluate the model on the held-out test set
+             test_examples = convert_to_spacy_format(test_sentences, test_tags)
+             test_metrics = nlp.evaluate(test_examples)
+             for metric_name in ner_performance_metrics:
+                 metric_value = test_metrics.get(metric_name, 0.0)
+                 st.write(f"{metric_name}: {metric_value}")
+ 
+             st.write("---")
+             st.write("Per-epoch training losses:")
+             st.write(train_api_metrics)
+             st.write("Per-epoch validation metrics:")
+             st.write(ner_metrics)
+             st.write("Test metrics:")
+             st.write(test_metrics)
+         else:
+             st.warning("Please upload training data in CSV format.")
+ 
+ def gen_ai():
+     # Streamlit app layout
+     st.title("Few-Shot Named Entity Recognition with Flan")
+ 
+     # Load the selected Flan model
+     model_name = st.selectbox("Select Flan Model", ["google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"])
+     model = T5ForConditionalGeneration.from_pretrained(model_name)
+ 
+     # Load a pre-trained tokenizer that's compatible with T5
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     st.write("---")
+     # User input for few-shot examples
+     st.subheader("Few-Shot Examples")
+     examples = []
+     num_examples = st.number_input("Number of Examples", min_value=1, value=2)
+     for i in range(num_examples):
+         col1, col2 = st.columns([3, 1])
+         with col1:
+             example_text = st.text_input(f"Example {i + 1} (Text)")
+         with col2:
+             example_label = st.text_input(f"Example {i + 1} (Label)")
+         if example_text and example_label:
+             examples.append((example_text, example_label))
+     st.write("---")
+     # User input for query text
+     st.subheader("Query Text")
+     query = st.text_input("Enter Query Text")
+ 
+     # Detect Entities button
+     detect_button = st.button("Detect Entities")
+ 
+     # Generate named entities
+     if detect_button:
+         if not examples or not query:
+             st.warning("Need both examples and query as user input", icon="⚠️")
+             return
+ 
+         # Build a few-shot prompt from the examples, then append the query
+         prompt = "\n".join(f"NER: {text} Labels: {label}" for text, label in examples)
+         prompt += f"\n{query} Labels:"
+         input_ids = tokenizer.encode(prompt, return_tensors="pt")
+         outputs = model.generate(input_ids, max_length=100, num_return_sequences=1)
+         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ 
+         # Process the generated output for displaCy
+         entities = generated_text.split("Labels:")
+         entities = [e.strip().split(":")[0].strip() for e in entities if e.strip()]
+         st.write("---")
+         # Display identified named entities
+         st.subheader("Identified Named Entities:")
+ 
+         # Build a manual displaCy payload by locating each entity in the query
+         doc = {"text": query, "ents": [{"start": query.find(entity), "end": query.find(entity) + len(entity), "label": "Custom Entity"} for entity in entities], "title": None}
+         html = displacy.render(doc, style="ent", manual=True, minify=True)
+         st.components.v1.html(html)
+         st.write("---")
+         st.write(doc)
+ 
+ 
+ def ensure_folders_exist(script_dir):
+     images_path = os.path.join(script_dir, "images")
+     saved_model_path = os.path.join(script_dir, "saved_models")
+ 
+     # Create the 'images' directory if it doesn't exist
+     if not os.path.exists(images_path):
+         os.makedirs(images_path)
+ 
+     # Create the 'saved_models' directory if it doesn't exist
+     if not os.path.exists(saved_model_path):
+         os.makedirs(saved_model_path)
+ 
+ def main():
+     script_dir = os.path.dirname(os.path.abspath(__file__))
+ 
+     # Ensure that required folders exist
+     ensure_folders_exist(script_dir)
+ 
+     # Streamlit app
+     st.set_page_config(page_title="NER Model Experimentation")
+ 
+     st.sidebar.title("Navigation")
+     page = st.sidebar.radio("Go to", ["Online Inference", "Model Training", "GEN AI"])
+ 
+     if page == "Online Inference":
+         online_inference()
+     elif page == "Model Training":
+         model_training()
+     elif page == "GEN AI":
+         gen_ai()
+ 
+ 
+ # Call the main function
+ if __name__ == "__main__":
+     main()
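Note: `load_data_from_csv` assumes the flattened one-token-per-row CSV layout popularized by the Kaggle NER dataset: a "Sentence #" column set on the first token of each sentence and forward-filled for the rest, a "Word" column, and a BIO "Tag" column. A minimal sketch of a compatible file (the sentences and tags below are illustrative, not from this repository):

    import pandas as pd

    # Illustrative rows only; "Sentence #" is blank on continuation rows
    # and forward-filled by load_data_from_csv
    rows = [
        ("Sentence: 1", "London",   "B-geo"),
        (None,          "is",       "O"),
        (None,          "calling",  "O"),
        ("Sentence: 2", "Google",   "B-org"),
        (None,          "expanded", "O"),
    ]
    df = pd.DataFrame(rows, columns=["Sentence #", "Word", "Tag"])
    df.to_csv("sample_training_data.csv", index=False, encoding="latin-1")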
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ numpy
+ pandas
+ streamlit
+ matplotlib
+ scikit-learn
+ torch
+ transformers
+ spacy==3.6.1
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0.tar.gz
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0.tar.gz
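The three GitHub URLs pin the pretrained pipelines to the 3.6.0 releases matching `spacy==3.6.1`. A quick sanity check after `pip install -r requirements.txt` (a hedged sketch, assuming the install succeeded) is to confirm each pinned pipeline loads by name:

    import spacy

    # Confirm the pinned pipelines are importable before starting the app
    for name in ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"]:
        nlp = spacy.load(name)
        print(name, "loaded:", nlp.pipe_names)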
run.py ADDED
@@ -0,0 +1,39 @@
+ """Run the Streamlit app through this Python file."""
+ 
+ import os
+ import subprocess
+ import argparse
+ 
+ def ensure_folders_exist(script_dir):
+     images_path = os.path.join(script_dir, "images")
+     saved_model_path = os.path.join(script_dir, "saved_models")
+ 
+     # Create the 'images' directory if it doesn't exist
+     if not os.path.exists(images_path):
+         os.makedirs(images_path)
+ 
+     # Create the 'saved_models' directory if it doesn't exist
+     if not os.path.exists(saved_model_path):
+         os.makedirs(saved_model_path)
+ 
+ def run():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--port", type=int, default=8501,
+                         help="Port number for the Streamlit app")
+     args = parser.parse_args()
+ 
+     # Get the absolute path to the current script's directory
+     script_dir = os.path.dirname(os.path.abspath(__file__))
+ 
+     # Ensure that required folders exist
+     ensure_folders_exist(script_dir)
+ 
+     # Construct the path to app.py
+     app_path = os.path.join(script_dir, "app.py")
+ 
+     # Run the Streamlit app defined at app_path
+     cmd = ["python", "-m", "streamlit", "run", "--server.port", str(args.port), app_path]
+     subprocess.call(cmd)
+ 
+ if __name__ == "__main__":
+     run()
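Usage: with the defaults above, `python run.py` launches the app on port 8501, and `python run.py --port 8502` overrides the port. The script simply shells out to `python -m streamlit run app.py`, so `streamlit run app.py` remains an equivalent entry point, as noted in the app.py docstring.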