Olivier CARON committed on
Commit
b25dbae
•
1 Parent(s): 09c9ad7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -37
app.py CHANGED
@@ -7,36 +7,46 @@ from gliner import GLiNER
7
  from gliner_file import run_ner
8
  import time
9
 
10
- st.set_page_config(page_title="GliNER", page_icon="🧊", layout="wide", initial_sidebar_state="expanded")
 
 
 
11
 
12
  # Modified function to load data from either an Excel or CSV file
13
  @st.cache_data
14
  def load_data(file):
15
  _, file_ext = os.path.splitext(file.name)
16
- if file_ext.lower() in ['.xls', '.xlsx']:
17
  return pl.read_excel(file)
18
- elif file_ext.lower() == '.csv':
19
  file.seek(0) # Go back to the beginning of the file
20
  try:
21
- sample = file.read(4096).decode('utf-8') # Try to decode the sample in UTF-8
22
- encoding = 'utf-8'
 
 
23
  except UnicodeDecodeError:
24
- encoding = 'latin1' # Switch to 'latin1' if UTF-8 fails
25
  file.seek(0)
26
  sample = file.read(4096).decode(encoding)
27
-
28
  file.seek(0)
29
  dialect = csv.Sniffer().sniff(sample) # Detect the delimiter
30
 
31
  file.seek(0)
32
- if encoding != 'utf-8':
33
  file_content = file.read().decode(encoding)
34
  file = StringIO(file_content)
35
  else:
36
- file_content = file.read().decode('utf-8')
37
  file = StringIO(file_content)
38
-
39
- return pl.read_csv(file, separator=dialect.delimiter, truncate_ragged_lines=True, ignore_errors=True)
 
 
 
 
 
40
  else:
41
  raise ValueError("The uploaded file must be a CSV or Excel file.")
42
 
@@ -44,51 +54,60 @@ def load_data(file):
44
  # Function to perform NER and update the UI
45
  def perform_ner(filtered_df, selected_column, labels_list):
46
  ner_results_dict = {label: [] for label in labels_list}
47
-
48
  progress_bar = st.progress(0)
49
  progress_text = st.empty()
50
-
51
  start_time = time.time() # Record start time for total runtime
52
 
53
  for index, row in enumerate(filtered_df.to_pandas().itertuples(), 1):
54
  iteration_start_time = time.time() # Start time for this iteration
55
-
56
  if st.session_state.stop_processing:
57
  progress_text.text("Process stopped by the user.")
58
  break
59
 
60
  text_to_analyze = getattr(row, selected_column)
61
- ner_results = run_ner(st.session_state.gliner_model, text_to_analyze, labels_list)
 
 
62
 
63
  for label in labels_list:
64
  texts = ner_results.get(label, [])
65
- concatenated_texts = ', '.join(texts)
66
  ner_results_dict[label].append(concatenated_texts)
67
 
68
  progress = index / filtered_df.height
69
  progress_bar.progress(progress)
70
-
71
- iteration_time = time.time() - iteration_start_time # Calculate runtime for this iteration
 
 
72
  total_time = time.time() - start_time # Calculate total elapsed time so far
73
-
74
- progress_text.text(f"Progress: {index}/{filtered_df.height} - {progress * 100:.0f}% (Iteration: {iteration_time:.2f}s, Total: {total_time:.2f}s)")
 
 
75
 
76
  end_time = time.time() # Record end time
77
  total_execution_time = end_time - start_time # Calculate total runtime
78
-
79
- progress_text.text(f"Processing complete! Total execution time: {total_execution_time:.2f}s")
80
-
 
 
81
  for label, texts in ner_results_dict.items():
82
  filtered_df = filtered_df.with_columns(pl.Series(name=label, values=texts))
83
 
84
  return filtered_df
85
 
 
86
  def main():
87
  st.title("Online NER with GliNER")
88
  st.markdown("Prototype v0.1")
89
 
90
  # Ensure the stop_processing flag is initialized
91
- if 'stop_processing' not in st.session_state:
92
  st.session_state.stop_processing = False
93
 
94
  uploaded_file = st.sidebar.file_uploader("Choose a file")
@@ -104,9 +123,15 @@ def main():
104
 
105
  selected_column = st.selectbox("Select the column for NER:", df.columns, index=0)
106
  filter_text = st.text_input("Filter column by input text", "")
107
- ner_labels = st.text_input("Enter all your different labels, separated by a comma", "")
108
-
109
- filtered_df = df.filter(pl.col(selected_column).str.contains(f"(?i).*{filter_text}.*")) if filter_text else df
 
 
 
 
 
 
110
  st.dataframe(filtered_df)
111
 
112
  if st.button("Start NER"):
@@ -114,27 +139,35 @@ def main():
114
  st.warning("Please enter some labels for NER.")
115
  else:
116
  # Load GLiNER model if not already loaded
117
- if 'gliner_model' not in st.session_state:
118
- with st.spinner('Loading GLiNER model... Please wait.'):
119
- st.session_state.gliner_model = GLiNER.from_pretrained("urchade/gliner_largev2")
 
 
120
  st.session_state.gliner_model.eval()
121
-
122
  labels_list = ner_labels.split(",")
123
  updated_df = perform_ner(filtered_df, selected_column, labels_list)
124
  st.dataframe(updated_df)
125
 
126
  def to_excel(df):
127
  output = BytesIO()
128
- df.to_pandas().to_excel(output, index=False, engine='openpyxl')
129
  return output.getvalue()
130
 
131
  df_excel = to_excel(updated_df)
132
- st.download_button(label="📥 Download Excel",
133
- data=df_excel,
134
- file_name="ner_results.xlsx",
135
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
 
 
 
 
 
 
 
136
 
137
- st.button("Stop Processing", on_click=lambda: setattr(st.session_state, 'stop_processing', True))
138
 
139
  if __name__ == "__main__":
140
  main()
 
7
  from gliner_file import run_ner
8
  import time
9
 
10
+ st.set_page_config(
11
+ page_title="GliNER", page_icon="🧊", layout="wide", initial_sidebar_state="expanded"
12
+ )
13
+
14
 
15
  # Modified function to load data from either an Excel or CSV file
16
  @st.cache_data
17
  def load_data(file):
18
  _, file_ext = os.path.splitext(file.name)
19
+ if file_ext.lower() in [".xls", ".xlsx"]:
20
  return pl.read_excel(file)
21
+ elif file_ext.lower() == ".csv":
22
  file.seek(0) # Go back to the beginning of the file
23
  try:
24
+ sample = file.read(4096).decode(
25
+ "utf-8"
26
+ ) # Try to decode the sample in UTF-8
27
+ encoding = "utf-8"
28
  except UnicodeDecodeError:
29
+ encoding = "latin1" # Switch to 'latin1' if UTF-8 fails
30
  file.seek(0)
31
  sample = file.read(4096).decode(encoding)
32
+
33
  file.seek(0)
34
  dialect = csv.Sniffer().sniff(sample) # Detect the delimiter
35
 
36
  file.seek(0)
37
+ if encoding != "utf-8":
38
  file_content = file.read().decode(encoding)
39
  file = StringIO(file_content)
40
  else:
41
+ file_content = file.read().decode("utf-8")
42
  file = StringIO(file_content)
43
+
44
+ return pl.read_csv(
45
+ file,
46
+ separator=dialect.delimiter,
47
+ truncate_ragged_lines=True,
48
+ ignore_errors=True,
49
+ )
50
  else:
51
  raise ValueError("The uploaded file must be a CSV or Excel file.")
52
 
 
54
  # Function to perform NER and update the UI
55
  def perform_ner(filtered_df, selected_column, labels_list):
56
  ner_results_dict = {label: [] for label in labels_list}
57
+
58
  progress_bar = st.progress(0)
59
  progress_text = st.empty()
60
+
61
  start_time = time.time() # Record start time for total runtime
62
 
63
  for index, row in enumerate(filtered_df.to_pandas().itertuples(), 1):
64
  iteration_start_time = time.time() # Start time for this iteration
65
+
66
  if st.session_state.stop_processing:
67
  progress_text.text("Process stopped by the user.")
68
  break
69
 
70
  text_to_analyze = getattr(row, selected_column)
71
+ ner_results = run_ner(
72
+ st.session_state.gliner_model, text_to_analyze, labels_list
73
+ )
74
 
75
  for label in labels_list:
76
  texts = ner_results.get(label, [])
77
+ concatenated_texts = ", ".join(texts)
78
  ner_results_dict[label].append(concatenated_texts)
79
 
80
  progress = index / filtered_df.height
81
  progress_bar.progress(progress)
82
+
83
+ iteration_time = (
84
+ time.time() - iteration_start_time
85
+ ) # Calculate runtime for this iteration
86
  total_time = time.time() - start_time # Calculate total elapsed time so far
87
+
88
+ progress_text.text(
89
+ f"Progress: {index}/{filtered_df.height} - {progress * 100:.0f}% (Iteration: {iteration_time:.2f}s, Total: {total_time:.2f}s)"
90
+ )
91
 
92
  end_time = time.time() # Record end time
93
  total_execution_time = end_time - start_time # Calculate total runtime
94
+
95
+ progress_text.text(
96
+ f"Processing complete! Total execution time: {total_execution_time:.2f}s"
97
+ )
98
+
99
  for label, texts in ner_results_dict.items():
100
  filtered_df = filtered_df.with_columns(pl.Series(name=label, values=texts))
101
 
102
  return filtered_df
103
 
104
+
105
  def main():
106
  st.title("Online NER with GliNER")
107
  st.markdown("Prototype v0.1")
108
 
109
  # Ensure the stop_processing flag is initialized
110
+ if "stop_processing" not in st.session_state:
111
  st.session_state.stop_processing = False
112
 
113
  uploaded_file = st.sidebar.file_uploader("Choose a file")
 
123
 
124
  selected_column = st.selectbox("Select the column for NER:", df.columns, index=0)
125
  filter_text = st.text_input("Filter column by input text", "")
126
+ ner_labels = st.text_input(
127
+ "Enter all your different labels, separated by a comma", ""
128
+ )
129
+
130
+ filtered_df = (
131
+ df.filter(pl.col(selected_column).str.contains(f"(?i).*{filter_text}.*"))
132
+ if filter_text
133
+ else df
134
+ )
135
  st.dataframe(filtered_df)
136
 
137
  if st.button("Start NER"):
 
139
  st.warning("Please enter some labels for NER.")
140
  else:
141
  # Load GLiNER model if not already loaded
142
+ if "gliner_model" not in st.session_state:
143
+ with st.spinner("Loading GLiNER model... Please wait."):
144
+ st.session_state.gliner_model = GLiNER.from_pretrained(
145
+ "urchade/gliner_largev2"
146
+ )
147
  st.session_state.gliner_model.eval()
148
+
149
  labels_list = ner_labels.split(",")
150
  updated_df = perform_ner(filtered_df, selected_column, labels_list)
151
  st.dataframe(updated_df)
152
 
153
  def to_excel(df):
154
  output = BytesIO()
155
+ df.to_pandas().to_excel(output, index=False, engine="openpyxl")
156
  return output.getvalue()
157
 
158
  df_excel = to_excel(updated_df)
159
+ st.download_button(
160
+ label="📥 Download Excel",
161
+ data=df_excel,
162
+ file_name="ner_results.xlsx",
163
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
164
+ )
165
+
166
+ st.button(
167
+ "Stop Processing",
168
+ on_click=lambda: setattr(st.session_state, "stop_processing", True),
169
+ )
170
 
 
171
 
172
  if __name__ == "__main__":
173
  main()