vonewman committed
Commit 3f67191 · 1 Parent(s): e12504f

Update app.py

Files changed (1):
  1. app.py +3 -40
app.py CHANGED
@@ -27,21 +27,14 @@ def load_model():
     return trainer, model, tokenizer
 
 def align_word_ids(texts):
-
     trainer, model, tokenizer = load_model()
-
     tokenized_inputs = tokenizer(texts, padding='max_length', max_length=218, truncation=True)
-
     word_ids = tokenized_inputs.word_ids()
-
     previous_word_idx = None
     label_ids = []
-
     for word_idx in word_ids:
-
         if word_idx is None:
             label_ids.append(-100)
-
         elif word_idx != previous_word_idx:
             try:
                 label_ids.append(1)
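The loop in this hunk is the usual token-classification alignment step: a fast tokenizer's word_ids() returns one entry per sub-token (None for special and padding tokens, otherwise the index of the originating word), and everything except the first piece of each word is masked with -100. Below is a minimal, self-contained sketch of that masking logic with a hand-written word_ids list instead of real tokenizer output; the continuation branch sits between the two hunks and is not visible here, so masking repeated pieces with -100 is an assumption.

# Toy illustration of the align_word_ids masking loop (no model or tokenizer needed).
# A fast Hugging Face tokenizer's word_ids() yields None for special/pad tokens,
# otherwise the index of the source word; the values below are made up.
word_ids = [None, 0, 0, 1, 2, 2, None]   # e.g. [CLS] w0 w0 w1 w2 w2 [SEP]

label_ids = []
previous_word_idx = None
for word_idx in word_ids:
    if word_idx is None:
        label_ids.append(-100)       # filtered out later via label_ids != -100
    elif word_idx != previous_word_idx:
        label_ids.append(1)          # first sub-token of a word keeps a label
    else:
        label_ids.append(-100)       # assumed: continuation pieces are masked too
    previous_word_idx = word_idx

print(label_ids)                     # [-100, 1, -100, 1, 1, -100, -100]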
@@ -53,54 +46,30 @@ def align_word_ids(texts):
             except:
                 label_ids.append(-100)
         previous_word_idx = word_idx
-
     return label_ids
 
-
 def predict_ner_labels(model, tokenizer, sentence):
     use_cuda = torch.cuda.is_available()
     device = torch.device("cuda" if use_cuda else "cpu")
-
     if use_cuda:
         model = model.cuda()
-
     text = tokenizer(sentence, padding='max_length', max_length=218, truncation=True, return_tensors="pt")
     mask = text['attention_mask'].to(device)
     input_id = text['input_ids'].to(device)
     label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)
-
     logits = model(input_id, mask, None)
     logits_clean = logits[0][label_ids != -100]
-
     predictions = logits_clean.argmax(dim=1).tolist()
     prediction_label = [id2tag[i] for i in predictions]
-
     return prediction_label
 
 id2tag = {0: 'O', 1: 'B-LOC', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-DATE', 6: 'B-DATE', 7: 'I-ORG', 8: 'I-LOC'}
 
-
 def tag_sentence(text):
     trainer, model, tokenizer = load_model()
-
-    # Use your model to predict the tags
     predictions = predict_ner_labels(model, tokenizer, text)
-
-    # Create a DataFrame with "word" and "tag" columns
-    df = pd.DataFrame({'word': text.split(), 'tag': predictions})
-
-    # Replace the labels with numeric values
-    df['tag'] = df['tag'].map(id2tag)
-
-    # Apply conditional formatting to color the tags in the text
-    def color_tags(tag):
-        if tag == 'O':
-            return ''
-        else:
-            return 'color: blue'
-
-    df['word'] = df.apply(lambda row: f'<span style="{color_tags(row["tag"])}">{row["word"]}</span>', axis=1)
-
+    # Create a DataFrame with "words" and "tags" columns
+    df = pd.DataFrame({'words': text.split(), 'tags': predictions})
     return df
 
 st.title("📘 Named Entity Recognition Wolof")
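Net effect of this hunk: tag_sentence no longer builds HTML <span> markup or re-maps the tags; it returns a plain two-column DataFrame. A sketch of the new return value, with a made-up sentence and made-up predictions standing in for the model output:

import pandas as pd

# Hypothetical example; 'text' and 'predictions' stand in for the real
# inputs (text.split() and predict_ner_labels(model, tokenizer, text)).
text = "Abdoulaye dem na Dakar"
predictions = ['B-PER', 'O', 'O', 'B-LOC']

df = pd.DataFrame({'words': text.split(), 'tags': predictions})
print(df)
#        words   tags
# 0  Abdoulaye  B-PER
# 1        dem      O
# 2         na      O
# 3      Dakar  B-LOC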
@@ -117,11 +86,8 @@ if submit_button:
     else:
         st.markdown("### Tagged Sentence")
         st.header("")
-
         results = tag_sentence(x1)
-
         cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
-
         with c1:
             csvbutton = st.download_button(label="📥 Download .csv", data=convert_df(results),
                                            file_name="results.csv", mime='text/csv', key='csv')
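convert_df and convert_json are defined elsewhere in app.py and are not part of this diff; the sketch below shows plausible implementations of such helpers feeding st.download_button (the real ones may differ):

import json
import pandas as pd
import streamlit as st

# Hypothetical helpers; app.py's actual convert_df / convert_json are not shown in this diff.
def convert_df(df: pd.DataFrame) -> bytes:
    # st.download_button accepts str or bytes for the data argument.
    return df.to_csv(index=False).encode('utf-8')

def convert_json(df: pd.DataFrame) -> str:
    return json.dumps(df.to_dict(orient='records'), ensure_ascii=False)

results = pd.DataFrame({'words': ['Dakar'], 'tags': ['B-LOC']})
st.download_button(label="📥 Download .csv", data=convert_df(results),
                   file_name="results.csv", mime='text/csv', key='csv')
st.download_button(label="📥 Download .json", data=convert_json(results),
                   file_name="results.json", mime='application/json', key='json')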
@@ -131,13 +97,10 @@ if submit_button:
         with c3:
             jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(results),
                                             file_name="results.json", mime='application/json', key='json')
-
         st.header("")
-
         c1, c2, c3 = st.columns([1, 3, 1])
-
         with c2:
-            st.write(results.to_html(escape=False), unsafe_allow_html=True)
+            st.table(results[['words', 'tags']])
 
         st.header("")
         st.header("")
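The final hunk replaces the raw-HTML rendering (results.to_html with unsafe_allow_html=True) with Streamlit's built-in st.table, which pairs with the removal of the per-word <span> coloring above. A minimal sketch of the new display path, assuming a results DataFrame shaped like the one tag_sentence now returns:

import pandas as pd
import streamlit as st

# Hypothetical stand-in for tag_sentence(x1); the column names match the new app.py.
results = pd.DataFrame({'words': ['Abdoulaye', 'dem', 'na', 'Dakar'],
                        'tags': ['B-PER', 'O', 'O', 'B-LOC']})

c1, c2, c3 = st.columns([1, 3, 1])
with c2:
    # Previously: st.write(results.to_html(escape=False), unsafe_allow_html=True)
    st.table(results[['words', 'tags']])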
 