faizhalas committed on
Commit
742d49f
·
verified ·
1 Parent(s): 6836bde

Update pages/5 Burst Detection.py

Browse files
Files changed (1) hide show
  1. pages/5 Burst Detection.py +9 -4
pages/5 Burst Detection.py CHANGED
@@ -135,7 +135,10 @@ def clean_data(df):
135
  df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
136
 
137
  # Vectorize processed text
138
- vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
 
 
 
139
  X = vectorizer.fit_transform(df['processed'].tolist())
140
 
141
  # Create DataFrame from the Document-Term Matrix (DTM)
@@ -350,14 +353,16 @@ uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
350
 
351
  if uploaded_file is not None:
352
  try:
353
- c1, c2, c3 = st.columns([3,3.5,3.5])
354
  top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
355
  viz_selected = c2.selectbox("Option for visualization",
356
  ("Line graph", "Scatter plot"), on_change=reset_all)
357
- running_total = c3.selectbox("Option for counting words",
358
  ("Running total", "By occurrences each year"), on_change=reset_all)
 
 
359
 
360
- d1, d2 = st.columns([3,7])
361
  df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
362
  col_name = d1.selectbox("Select column to analyze",
363
  (coldf), on_change=reset_all)
 
135
  df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
136
 
137
  # Vectorize processed text
138
+ if count_method == "Document Frequency":
139
+ vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split(), binary=True)
140
+ else:
141
+ vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
142
  X = vectorizer.fit_transform(df['processed'].tolist())
143
 
144
  # Create DataFrame from the Document-Term Matrix (DTM)
 
353
 
354
  if uploaded_file is not None:
355
  try:
356
+ c1, c2, c3, c4 = st.columns([2,2,3,3])
357
  top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
358
  viz_selected = c2.selectbox("Option for visualization",
359
  ("Line graph", "Scatter plot"), on_change=reset_all)
360
+ running_total = c3.selectbox("Calculation method",
361
  ("Running total", "By occurrences each year"), on_change=reset_all)
362
+ count_method = c4.selectbox("Count by",
363
+ ("Term Frequency", "Document Frequency"), on_change=reset_all)
364
 
365
+ d1, d2 = st.columns([2,8])
366
  df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
367
  col_name = d1.selectbox("Select column to analyze",
368
  (coldf), on_change=reset_all)