ruanchaves committed on
Commit
cc99942
1 Parent(s): 8907e41

hashtag segmentation

Browse files
Files changed (1) hide show
  1. app.py +16 -12
app.py CHANGED
@@ -11,14 +11,16 @@ Hashtag segmentation is the task of automatically adding spaces between the word
11
  This app uses the <a href=\"https://github.com/ruanchaves/hashformers\">Hashformers library</a> to suggest segmentations for hashtags.
12
 
13
  Enter a hashtag or pick one from the examples below. The app will suggest the best segmentation for the hashtag.
 
 
14
  """
15
 
16
  app_examples = [
17
- ["#cristianoronaldo", "cristian o ronaldo", "portuguese"],
18
- ["#madridsinfiltros", "", "spanish"],
19
- ["#kuenstlicheintelligenz", "kuenstliche intelligenz", "german"],
20
- ["#dadscare", "dad scare, dads care", "english (fast)"],
21
- ["#nowthatcherisdead", "now that cher is dead, now thatcher is dead", "english"],
22
  ]
23
 
24
  output_json_component_description = {"": ""}
@@ -86,7 +88,7 @@ def parse_candidates(candidates):
86
  candidates = [c.strip() for c in candidates]
87
  return candidates
88
 
89
- def predict(s1, candidates, language, use_reranker, topk, steps):
90
  hashtag_list = [s1]
91
  if language:
92
  chosen_model = model_dict[language]
@@ -100,13 +102,16 @@ def predict(s1, candidates, language, use_reranker, topk, steps):
100
  segmenter_df = format_dataframe(segmentation.segmenter_rank)
101
  reranker_df = format_dataframe(segmentation.reranker_rank)
102
 
 
 
 
 
103
 
104
  top_segmentation = segmentation.output[0]
105
  segmenter_score_dict = convert_to_score_dict(segmenter_df)
106
  reranker_score_dict = convert_to_score_dict(reranker_df)
107
  top_segmentation_df = get_candidates_df([top_segmentation], segmenter_score_dict, reranker_score_dict)
108
-
109
- candidates_list = parse_candidates(candidates)
110
 
111
  candidates_df = get_candidates_df(candidates_list, segmenter_score_dict, reranker_score_dict)
112
  output_df = pd.concat([top_segmentation_df, candidates_df], axis=0)
@@ -123,16 +128,15 @@ def predict(s1, candidates, language, use_reranker, topk, steps):
123
 
124
  inputs = [
125
  gr.Textbox(label="Hashtag"),
126
- gr.Textbox(label="Candidate segmentations"),
127
  gr.Dropdown(language_list, label="Language", value="english (fast)"),
128
  gr.Checkbox(label="Use reranker", value=True),
129
- gr.Slider(0, 100, value=20, label="Advanced setting - Beamsearch top-k"),
130
- gr.Slider(0, 100, value=13, label="Advanced setting - Beamsearch steps")
131
  ]
132
 
133
  outputs = [
134
  gr.Textbox(label="Suggested segmentation"),
135
- gr.DataFrame(label="Scores"),
136
  ]
137
 
138
 
 
11
  This app uses the <a href=\"https://github.com/ruanchaves/hashformers\">Hashformers library</a> to suggest segmentations for hashtags.
12
 
13
  Enter a hashtag or pick one from the examples below. The app will suggest the best segmentation for the hashtag.
14
+
15
+ In the advanced settings, decreasing the slider values will make the app faster, but it may also reduce its accuracy.
16
  """
17
 
18
  app_examples = [
19
+ ["#cristianoronaldo", "portuguese"],
20
+ ["#madridsinfiltros", "spanish"],
21
+ ["#kuenstlicheintelligenz", "german"],
22
+ ["#dadscare", "english (fast)"],
23
+ ["#nowthatcherisdead", "english"],
24
  ]
25
 
26
  output_json_component_description = {"": ""}
 
88
  candidates = [c.strip() for c in candidates]
89
  return candidates
90
 
91
+ def predict(s1, language, use_reranker, topk, steps):
92
  hashtag_list = [s1]
93
  if language:
94
  chosen_model = model_dict[language]
 
102
  segmenter_df = format_dataframe(segmentation.segmenter_rank)
103
  reranker_df = format_dataframe(segmentation.reranker_rank)
104
 
105
+ if not use_reranker:
106
+ candidates_list = segmenter_df.head(3)["segmentation"].tolist()
107
+ else:
108
+ candidates_list = reranker_df.head(3)["segmentation"].tolist()
109
 
110
  top_segmentation = segmentation.output[0]
111
  segmenter_score_dict = convert_to_score_dict(segmenter_df)
112
  reranker_score_dict = convert_to_score_dict(reranker_df)
113
  top_segmentation_df = get_candidates_df([top_segmentation], segmenter_score_dict, reranker_score_dict)
114
+
 
115
 
116
  candidates_df = get_candidates_df(candidates_list, segmenter_score_dict, reranker_score_dict)
117
  output_df = pd.concat([top_segmentation_df, candidates_df], axis=0)
 
128
 
129
  inputs = [
130
  gr.Textbox(label="Hashtag"),
 
131
  gr.Dropdown(language_list, label="Language", value="english (fast)"),
132
  gr.Checkbox(label="Use reranker", value=True),
133
+ gr.Slider(0, 100, value=20, label="Advanced setting - Beamsearch: Number of beams"),
134
+ gr.Slider(0, 100, value=13, label="Advanced setting - Maximum number of spaces allowed")
135
  ]
136
 
137
  outputs = [
138
  gr.Textbox(label="Suggested segmentation"),
139
+ gr.DataFrame(label="Top alternatives"),
140
  ]
141
 
142