pmkhanh7890 commited on
Commit
26e3944
·
1 Parent(s): 56cf7e3

add 2 more screen for ordinary and governor users

Browse files
application_2.py CHANGED
@@ -8,10 +8,6 @@ from src.application.content_detection import NewsVerification
8
  from src.application.url_reader import URLReader
9
  from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
 
11
-
12
- GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
13
- SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
14
-
15
  AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
16
  AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
17
 
@@ -52,7 +48,7 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Imag
52
  # Define the GUI
53
  with gr.Blocks() as demo:
54
  gr.Markdown("# NEWS VERIFICATION")
55
-
56
  with gr.Row():
57
  # SETTINGS
58
  with gr.Column(scale=1):
@@ -88,15 +84,56 @@ with gr.Blocks() as demo:
88
  news_content = gr.Textbox(label="Content", value="", lines=13)
89
 
90
  # NEWS ANALYSIS REPORT
91
- explanation = """
 
92
  - Green texts are the matched words in the input and source news.<br>
93
  - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
94
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  with gr.Column(scale=2):
96
- with gr.Accordion("News Analysis"):
97
- gr.HTML(explanation)
98
- detection_button = gr.Button("Verify news")
99
- detailed_analysis = gr.HTML("<br>"*40)
 
 
 
 
 
 
 
100
 
101
  # Connect events
102
  load_button.click(
@@ -113,9 +150,9 @@ with gr.Blocks() as demo:
113
  generate_image_button.click(generate_fake_image,
114
  inputs=[image_generation_model, news_title],
115
  outputs=[news_image])
116
- detection_button.click(generate_analysis_report,
117
  inputs=[news_title, news_content, news_image],
118
- outputs=[detailed_analysis])
119
 
120
  # change Image
121
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
 
8
  from src.application.url_reader import URLReader
9
  from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
 
 
 
 
 
11
  AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
12
  AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
13
 
 
48
  # Define the GUI
49
  with gr.Blocks() as demo:
50
  gr.Markdown("# NEWS VERIFICATION")
51
+
52
  with gr.Row():
53
  # SETTINGS
54
  with gr.Column(scale=1):
 
84
  news_content = gr.Textbox(label="Content", value="", lines=13)
85
 
86
  # NEWS ANALYSIS REPORT
87
+ ordinary_user_explanation = """
88
+ FOR ORDINARY USER<br>
89
  - Green texts are the matched words in the input and source news.<br>
90
  - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
91
  """
92
+ fact_checker_explanation = """
93
+ FOR FACT CHECKER<br>
94
+ - Green texts are the matched words in the input and source news.<br>
95
+ - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
96
+ """
97
+ governor_explanation = """
98
+ FOR GOVERNOR<br>
99
+ - Green texts are the matched words in the input and source news.<br>
100
+ - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
101
+ """
102
+ table = """
103
+ <h5>Comparison between input news and source news</h5>
104
+ <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
105
+ <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
106
+ <thead>
107
+ <tr>
108
+ <th>Input news</th>
109
+ <th>Source (URL provided in Originality column correspondingly)</th>
110
+ <th>Forensic</th>
111
+ <th>Originality</th>
112
+ </tr>
113
+ </thead>
114
+ <tbody>
115
+ <tr>
116
+ <th>Input 1</th>
117
+ <th>Source 1(URL provided in Originality column correspondingly)</th>
118
+ <th>Forensic 1</th>
119
+ <th>Originality 1</th>
120
+ </tr>
121
+ </tbody>
122
+ </table>
123
+
124
+ <style>"""
125
  with gr.Column(scale=2):
126
+ with gr.Accordion("NEWS ANALYSIS"):
127
+ verification_button = gr.Button("Verify news")
128
+ with gr.Tab("Orinary User"):
129
+ gr.HTML(ordinary_user_explanation)
130
+ ordinary_user_result = gr.HTML(table)
131
+ with gr.Tab("Fact Checker"):
132
+ gr.HTML(fact_checker_explanation)
133
+ fact_checker_result = gr.HTML("<br>"*40)
134
+ with gr.Tab("Governor"):
135
+ gr.HTML(fact_checker_explanation)
136
+ governor_result = gr.HTML(table)
137
 
138
  # Connect events
139
  load_button.click(
 
150
  generate_image_button.click(generate_fake_image,
151
  inputs=[image_generation_model, news_title],
152
  outputs=[news_image])
153
+ verification_button.click(generate_analysis_report,
154
  inputs=[news_title, news_content, news_image],
155
+ outputs=[ordinary_user_result, fact_checker_result, governor_result])
156
 
157
  # change Image
158
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
examples/example_text_LLM_modification.txt CHANGED
@@ -1 +1,3 @@
1
- Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe. He made a substitute appearance and waved farewell to fans in Newcastle's recent win against Southampton. Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2022-23, and scored against Paris St-Germain in the Champions League.
 
 
 
1
+ Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
2
+ He made a substitute appearance and waved farewell to fans in Newcastle's recent loss against Southampton.
3
+ Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2020-21, and scored against Paris St-Germain in the Champions League.
src/application/content_detection.py CHANGED
@@ -1,10 +1,9 @@
1
  from difflib import SequenceMatcher
2
- import difflib
3
- import string
4
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
5
  from src.application.text.entity import apply_highlight, highlight_entities
 
6
  from src.application.text.model_detection import detect_text_by_ai_model
7
- from src.application.text.preprocessing import split_into_sentences
8
  from src.application.text.search_detection import check_human, detect_text_by_relative_search
9
 
10
 
@@ -27,7 +26,10 @@ class NewsVerification():
27
  self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
28
  self.aligned_sentences:list[dict] = []
29
  self.is_paraphrased:list[bool] = []
30
- self.analyzed_table:list[list] = []
 
 
 
31
 
32
  def load_news(self, news_title, news_content, news_image):
33
  self.news_text = news_title + "\n\n" + news_content
@@ -50,7 +52,7 @@ class NewsVerification():
50
  print("CHECK TEXT:")
51
  print("\tFrom search engine:")
52
  # Classify by search engine
53
- input_sentences = split_into_sentences(self.news_text)
54
  current_index = 0
55
  previous_paraphrase = None
56
  ai_sentence = {
@@ -87,7 +89,7 @@ class NewsVerification():
87
  else:
88
  if previous_paraphrase is False or previous_paraphrase is None:
89
  # add ai_sentences to align_sentences
90
- if ai_sentence["input_sentence"] != "":
91
  text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"])
92
  ai_sentence["label"] = text_prediction_label
93
  ai_sentence["similarity"] = text_prediction_score
@@ -102,7 +104,7 @@ class NewsVerification():
102
  "paraphrase": False,
103
  "url": "",
104
  }
105
-
106
  # add searched_sentences to align_sentences
107
  if searched_sentences["input_sentence"] != "":
108
  self.found_img_url.extend(img_urls)
@@ -181,63 +183,11 @@ class NewsVerification():
181
  self.detect_image_origin()
182
 
183
  def analyze_details(self):
184
- self.analyzed_table = []
 
 
185
 
186
- for aligned_sentence in self.aligned_sentences:
187
- if "input_sentence" not in aligned_sentence:
188
- continue
189
-
190
- # Get index of equal phrases in input and source sentences
191
- equal_idx_1, equal_idx_2 = self.extract_equal_text(
192
- aligned_sentence["input_sentence"],
193
- aligned_sentence["matched_sentence"],
194
- )
195
-
196
- # Get entity-words (in pair) with colors
197
- entities_with_colors = highlight_entities(
198
- aligned_sentence["input_sentence"],
199
- aligned_sentence["matched_sentence"],
200
- )
201
-
202
- self.analyzed_table.append(
203
- [
204
- aligned_sentence["input_sentence"],
205
- aligned_sentence["matched_sentence"],
206
- equal_idx_1,
207
- equal_idx_2,
208
- entities_with_colors,
209
- ]
210
- )
211
-
212
- if len(self.analyzed_table) != 0:
213
- html_table = self.create_table()
214
- else:
215
- html_table = ""
216
- return html_table
217
-
218
- def extract_equal_text(self, text1, text2):
219
- def cleanup(text):
220
- text = text.lower()
221
- text = text.translate(str.maketrans('', '', string.punctuation))
222
- return text
223
-
224
- splited_text1 = cleanup(text1).split()
225
- splited_text2 = cleanup(text2).split()
226
-
227
- s = SequenceMatcher(None, splited_text1, splited_text2)
228
-
229
- equal_idx_1 = []
230
- equal_idx_2 = []
231
- text1 = text1.split()
232
- text2 = text2.split()
233
- for tag, i1, i2, j1, j2 in s.get_opcodes():
234
- if tag == 'equal':
235
- equal_idx_1.append({"start": i1, "end": i2})
236
- equal_idx_2.append({"start": j1, "end": j2})
237
- # subtext_1 = " ".join(text1[i1:i2])
238
- # subtext_2 = " ".join(text2[j1:j2])
239
- # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
240
- return equal_idx_1, equal_idx_2
241
 
242
  def get_text_urls(self):
243
  return set(self.text_referent_url)
@@ -264,7 +214,7 @@ class NewsVerification():
264
  if not sentence_1 or not sentence_2: # Handle empty strings
265
  return []
266
 
267
- s = difflib.SequenceMatcher(None, sentence_1, sentence_2)
268
  common_phrases = []
269
 
270
  for block in s.get_matching_blocks():
@@ -287,16 +237,40 @@ class NewsVerification():
287
  position += len(sentence_1)
288
  return common_phrases, position
289
 
290
- def create_table(self):
291
- #table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table])
292
- # loop of self.analyzed_table with index:
293
  rows = []
294
  max_length = 30 # TODO: put this in configuration
295
- rows.append(self.format_image_row(max_length))
296
 
297
- for index, row in enumerate(self.analyzed_table):
298
- formatted_row = self.format_text_row(row, index, max_length)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  rows.append(formatted_row)
 
300
  table = "\n".join(rows)
301
  return f"""
302
  <h5>Comparison between input news and source news</h5>
@@ -316,29 +290,29 @@ class NewsVerification():
316
 
317
  <style>
318
  """
319
-
320
- def format_text_row(self, row, index = 0, max_length=30):
321
- if row[1] != "": # source is not empty
 
 
322
  # highlight entities
323
- input_sentence, highlight_idx_input = apply_highlight(row[0], row[4], "input")
324
- source_sentence, highlight_idx_source = apply_highlight(row[1], row[4], "source")
325
- print(f"highlighted_input: {input_sentence}")
326
 
327
  # Color overlapping words
328
- input_sentence = self.color_text(input_sentence, row[2], highlight_idx_input) # text, index of highlight words
329
- source_sentence = self.color_text(source_sentence, row[3], highlight_idx_source) # text, index of highlight words
330
- print(f"input_sentence: {input_sentence}")
331
 
332
  input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
333
  source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
334
  else:
335
- input_sentence = row[0]
336
- source_sentence = row[1]
337
 
338
- label = self.aligned_sentences[index]["label"]
339
- score = self.aligned_sentences[index]["similarity"]
340
 
341
- url = self.aligned_sentences[index]["url"] #
342
  short_url = self.shorten_url(url, max_length)
343
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
344
 
@@ -351,8 +325,7 @@ class NewsVerification():
351
  </tr>
352
  """
353
 
354
- def format_image_row(self, max_length=30):
355
- # input_image = f"""<img src="example_image_input.jpg" width="200" height="150">"""
356
 
357
  if self.image_referent_url is not None or self.image_referent_url != "":
358
  source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
@@ -363,7 +336,216 @@ class NewsVerification():
363
  source_image_url = ""
364
 
365
  return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
366
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
  def shorten_url(self, url, max_length=30):
368
  if url is None:
369
  return ""
@@ -374,23 +556,19 @@ class NewsVerification():
374
  short_url = url
375
  return short_url
376
 
 
377
  def color_text(self, text, colored_idx, highlighted_idx):
378
  paragraph = ""
379
  words = text.split()
380
 
381
  starts, ends = self.extract_starts_ends(colored_idx)
382
  starts, ends = self.filter_indices(starts, ends, highlighted_idx)
383
- print(f"highlighted_idx: {highlighted_idx}")
384
- print(f"starts_2: {starts}")
385
- print(f"ends_2: {ends}")
386
  previous_end = 0
387
  for start, end in zip(starts, ends):
388
  paragraph += " ".join(words[previous_end:start])
389
 
390
  equal_words = " ".join(words[start:end])
391
- print(f"starts_2: {start}")
392
- print(f"ends_2: {end}")
393
- print(f"equal_words: {words[start:end]}")
394
  paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
395
 
396
  previous_end = end
 
1
  from difflib import SequenceMatcher
 
 
2
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
3
  from src.application.text.entity import apply_highlight, highlight_entities
4
+ from src.application.text.helper import extract_equal_text
5
  from src.application.text.model_detection import detect_text_by_ai_model
6
+ from src.application.text.preprocessing import split_into_paragraphs
7
  from src.application.text.search_detection import check_human, detect_text_by_relative_search
8
 
9
 
 
26
  self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
27
  self.aligned_sentences:list[dict] = []
28
  self.is_paraphrased:list[bool] = []
29
+
30
+ self.ordinary_user_table:list = []
31
+ self.fact_checker_table:list = []
32
+ self.governor_table:list = []
33
 
34
  def load_news(self, news_title, news_content, news_image):
35
  self.news_text = news_title + "\n\n" + news_content
 
52
  print("CHECK TEXT:")
53
  print("\tFrom search engine:")
54
  # Classify by search engine
55
+ input_sentences = split_into_paragraphs(self.news_text)
56
  current_index = 0
57
  previous_paraphrase = None
58
  ai_sentence = {
 
89
  else:
90
  if previous_paraphrase is False or previous_paraphrase is None:
91
  # add ai_sentences to align_sentences
92
+ if ai_sentence["input_sentence"] != "" or current_index >= len(input_sentences):
93
  text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"])
94
  ai_sentence["label"] = text_prediction_label
95
  ai_sentence["similarity"] = text_prediction_score
 
104
  "paraphrase": False,
105
  "url": "",
106
  }
107
+
108
  # add searched_sentences to align_sentences
109
  if searched_sentences["input_sentence"] != "":
110
  self.found_img_url.extend(img_urls)
 
183
  self.detect_image_origin()
184
 
185
  def analyze_details(self):
186
+ ordinary_user_table = self.create_ordinary_user_table()
187
+ fact_checker_table = self.create_fact_checker_table()
188
+ governor_table = self.create_governor_table()
189
 
190
+ return ordinary_user_table, fact_checker_table, governor_table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  def get_text_urls(self):
193
  return set(self.text_referent_url)
 
214
  if not sentence_1 or not sentence_2: # Handle empty strings
215
  return []
216
 
217
+ s = SequenceMatcher(None, sentence_1, sentence_2)
218
  common_phrases = []
219
 
220
  for block in s.get_matching_blocks():
 
237
  position += len(sentence_1)
238
  return common_phrases, position
239
 
240
+ def create_fact_checker_table(self):
 
 
241
  rows = []
242
  max_length = 30 # TODO: put this in configuration
243
+ rows.append(self.format_image_fact_checker_row(max_length))
244
 
245
+ for aligned_sentence in self.aligned_sentences:
246
+ if "input_sentence" not in aligned_sentence:
247
+ continue
248
+
249
+ # Get index of equal phrases in input and source sentences
250
+ equal_idx_1, equal_idx_2 = extract_equal_text(
251
+ aligned_sentence["input_sentence"],
252
+ aligned_sentence["matched_sentence"],
253
+ )
254
+
255
+ # Get entity-words (in pair) with colors
256
+ entities_with_colors = highlight_entities(
257
+ aligned_sentence["input_sentence"],
258
+ aligned_sentence["matched_sentence"],
259
+ )
260
+
261
+ self.fact_checker_table.append(
262
+ [
263
+ aligned_sentence,
264
+ equal_idx_1,
265
+ equal_idx_2,
266
+ entities_with_colors,
267
+ ]
268
+ )
269
+
270
+ for row in self.fact_checker_table:
271
+ formatted_row = self.format_text_fact_checker_row(row, max_length)
272
  rows.append(formatted_row)
273
+
274
  table = "\n".join(rows)
275
  return f"""
276
  <h5>Comparison between input news and source news</h5>
 
290
 
291
  <style>
292
  """
293
+
294
+ def format_text_fact_checker_row(self, row, max_length=30):
295
+ if row[0]["input_sentence"] == "":
296
+ return ""
297
+ if row[0]["matched_sentence"] != "": # source is not empty
298
  # highlight entities
299
+ input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input")
300
+ source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source")
 
301
 
302
  # Color overlapping words
303
+ input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
304
+ source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source) # text, index of highlight words
 
305
 
306
  input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
307
  source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
308
  else:
309
+ input_sentence = row[0]["input_sentence"]
310
+ source_sentence = row[0]["matched_sentence"]
311
 
312
+ label = row[0]["label"]
313
+ score = row[0]["similarity"]
314
 
315
+ url = row[0]["url"] #
316
  short_url = self.shorten_url(url, max_length)
317
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
318
 
 
325
  </tr>
326
  """
327
 
328
+ def format_image_fact_checker_row(self, max_length=30):
 
329
 
330
  if self.image_referent_url is not None or self.image_referent_url != "":
331
  source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
 
336
  source_image_url = ""
337
 
338
  return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
339
+
340
+
341
+ def create_ordinary_user_table(self):
342
+ rows = []
343
+ max_length = 30 # TODO: put this in configuration
344
+ rows.append(self.format_image_ordinary_user_row(max_length))
345
+ rows.append(self.format_text_ordinary_user_row(max_length))
346
+ table = "\n".join(rows)
347
+
348
+ return f"""
349
+ <h5>Comparison between input news and source news</h5>
350
+ <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
351
+ <thead>
352
+ <tr>
353
+ <th>Input news</th>
354
+ <th>Forensic</th>
355
+ <th>Originality</th>
356
+ </tr>
357
+ </thead>
358
+ <tbody>
359
+ {table}
360
+ </tbody>
361
+ </table>
362
+
363
+ <style>
364
+ """
365
+
366
+ def format_text_ordinary_user_row(self, max_length=30):
367
+ input_sentences = ""
368
+ source_text_urls = ""
369
+ label = ""
370
+ scores = 0
371
+ sentence_count = 0
372
+ for index, row in enumerate(self.aligned_sentences):
373
+ if row["input_sentence"] == "":
374
+ continue
375
+ input_sentences += row["input_sentence"]
376
+ label = self.aligned_sentences[index]["label"]
377
+ if label == "HUMAN":
378
+ score = self.aligned_sentences[index]["similarity"]
379
+ if label == "MACHINE":
380
+ score = 1 - self.aligned_sentences[index]["similarity"]
381
+ scores += score
382
+
383
+ url = self.aligned_sentences[index]["url"] #
384
+ short_url = self.shorten_url(url, max_length)
385
+ source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
386
+ sentence_count += 1
387
+
388
+ if scores == 0:
389
+ label = "UNKNOWN"
390
+ else:
391
+ scores /= sentence_count
392
+ if scores > 0.5:
393
+ label = "HUMAN"
394
+ else:
395
+ label = "MACHINE"
396
+ scores = 1 - scores
397
+
398
+ return f"""
399
+ <tr>
400
+ <td>{input_sentences}</td>
401
+ <td>{label}<br>({scores*100:.2f}%)</td>
402
+ <td>{source_text_urls}</td>
403
+ </tr>
404
+ """
405
+
406
+ def format_image_ordinary_user_row(self, max_length=30):
407
+
408
+ if self.image_referent_url is not None or self.image_referent_url != "":
409
+ source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
410
+ short_url = self.shorten_url(self.image_referent_url, max_length)
411
+ source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
412
+ else:
413
+ source_image = "Image not found"
414
+ source_image_url = ""
415
+
416
+ return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
417
+
418
+
419
+ def create_governor_table(self):
420
+ rows = []
421
+ max_length = 30 # TODO: put this in configuration
422
+ rows.append(self.format_image_governor_row(max_length))
423
+
424
+ for aligned_sentence in self.aligned_sentences:
425
+ if "input_sentence" not in aligned_sentence:
426
+ continue
427
+
428
+ # Get index of equal phrases in input and source sentences
429
+ equal_idx_1, equal_idx_2 = extract_equal_text(
430
+ aligned_sentence["input_sentence"],
431
+ aligned_sentence["matched_sentence"],
432
+ )
433
+
434
+ # Get entity-words (in pair) with colors
435
+ entities_with_colors = highlight_entities(
436
+ aligned_sentence["input_sentence"],
437
+ aligned_sentence["matched_sentence"],
438
+ )
439
+
440
+ self.governor_table.append(
441
+ [
442
+ aligned_sentence,
443
+ equal_idx_1,
444
+ equal_idx_2,
445
+ entities_with_colors,
446
+ ]
447
+ )
448
+
449
+ formatted_row = self.format_text_governor_row(max_length)
450
+ rows.append(formatted_row)
451
+
452
+ table = "\n".join(rows)
453
+ return f"""
454
+ <h5>Comparison between input news and source news</h5>
455
+ <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
456
+ <col style="width: 150px;"> <col style="width: 150px;"> <col style="width: 50px;"> <col style="width: 75px;">
457
+ <thead>
458
+ <tr>
459
+ <th>Input news</th>
460
+ <th>Source (URL provided in Originality column correspondingly)</th>
461
+ <th>Forensic</th>
462
+ <th>Originality</th>
463
+ </tr>
464
+ </thead>
465
+ <tbody>
466
+ {table}
467
+ </tbody>
468
+ </table>
469
+
470
+ <style>
471
+ """
472
+
473
+ def format_text_governor_row(self, max_length=30):
474
+ input_sentences = ""
475
+ source_sentences = ""
476
+ source_text_urls = ""
477
+ label = ""
478
+ scores = 0
479
+ sentence_count = 0
480
+ entity_count = 0
481
+ for row in self.governor_table:
482
+ print(f"governor_row: {row}")
483
+ if row[0]["input_sentence"] == "":
484
+ continue
485
+
486
+ if row[0]["matched_sentence"] != "": # source is not empty
487
+ # highlight entities
488
+ input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input", entity_count)
489
+ source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source", entity_count)
490
+ entity_count += len(row[3])
491
+
492
+ # Color overlapping words
493
+ input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
494
+ source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source) # text, index of highlight words
495
+
496
+ input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
497
+ source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
498
+
499
+ else:
500
+ input_sentence = row[0]["input_sentence"]
501
+ source_sentence = row[0]["matched_sentence"]
502
+
503
+ input_sentences += input_sentence
504
+ source_sentences += source_sentence
505
+ score = row[0]["similarity"]
506
+ label = row[0]["label"]
507
+ if label == "HUMAN":
508
+ score = row[0]["similarity"]
509
+ if label == "MACHINE":
510
+ score = 1 - row[0]["similarity"]
511
+ scores += score
512
+
513
+ url = row[0]["url"]
514
+ short_url = self.shorten_url(url, max_length)
515
+ source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
516
+ sentence_count += 1
517
+
518
+ if scores == 0:
519
+ label = "UNKNOWN"
520
+ else:
521
+ scores /= sentence_count
522
+ if scores > 0.5:
523
+ label = "HUMAN"
524
+ else:
525
+ label = "MACHINE"
526
+ scores = 1 - scores
527
+
528
+ return f"""
529
+ <tr>
530
+ <td>{input_sentences}</td>
531
+ <td>{source_sentences}</td>
532
+ <td>{label}<br>({score*100:.2f}%)</td>
533
+ <td>{source_text_urls}</td>
534
+ </tr>
535
+ """
536
+
537
+ def format_image_governor_row(self, max_length=30):
538
+ if self.image_referent_url is not None or self.image_referent_url != "":
539
+ source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
540
+ short_url = self.shorten_url(self.image_referent_url, max_length)
541
+ source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
542
+ else:
543
+ source_image = "Image not found"
544
+ source_image_url = ""
545
+
546
+ return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
547
+
548
+
549
  def shorten_url(self, url, max_length=30):
550
  if url is None:
551
  return ""
 
556
  short_url = url
557
  return short_url
558
 
559
+
560
  def color_text(self, text, colored_idx, highlighted_idx):
561
  paragraph = ""
562
  words = text.split()
563
 
564
  starts, ends = self.extract_starts_ends(colored_idx)
565
  starts, ends = self.filter_indices(starts, ends, highlighted_idx)
566
+
 
 
567
  previous_end = 0
568
  for start, end in zip(starts, ends):
569
  paragraph += " ".join(words[previous_end:start])
570
 
571
  equal_words = " ".join(words[start:end])
 
 
 
572
  paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
573
 
574
  previous_end = end
src/application/text/entity.py CHANGED
@@ -15,27 +15,28 @@ AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
15
  AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
16
 
17
  client = openai.AzureOpenAI(
18
- api_version = AZURE_OPENAI_API_VERSION,
19
  api_key = AZURE_OPENAI_API_KEY,
20
  azure_endpoint = AZURE_OPENAI_ENDPOINT,
21
  )
22
 
23
 
24
- def extract_entities_gpt(original_text, compared_text, text_generation_model="gpt-4o-mini"):
25
- # "o1-mini-2024-09-12"
26
  # Generate text using the selected models
27
  prompt = f"""
28
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
29
  Identify and extract pairs of corresponding entities where the paraphrasing has resulted in a *significant* change in meaning.
30
  Focus *only* on entities where the paraphrasing has resulted in a *significant* change in meaning. This includes, but is not limited to:
31
  * **Numerical changes:** e.g., "five" changed to "ten," "10%" changed to "50%"
 
32
  * **Name changes:** e.g., "Tokyo" changed to "New York," "Japan" changed to "Japanese"
33
  * **Opposite meanings:** e.g., "increase" changed to "decrease," "good" changed to "bad"
34
  * **Semantically different words:** e.g., "car" changed to "truck," "walk" changed to "run"
35
 
36
  Exclude entities where the meaning remains essentially the same, even if the wording is different (e.g., "big" changed to "large," "house" changed to "residence"). Also exclude purely stylistic changes that don't affect the core meaning.
37
 
38
- Output the extracted entity pairs, one pair per line, in the following JSON-like list format:
39
  [
40
  ["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
41
  ["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
@@ -55,7 +56,7 @@ If there are no entities that satisfy above condition, output empty list "[]".
55
  try:
56
  response = client.chat.completions.create(
57
  model=text_generation_model,
58
- messages = [{"role": "system", "content": prompt}],
59
  )
60
 
61
  res = response.choices[0].message.content
@@ -69,7 +70,13 @@ If there are no entities that satisfy above condition, output empty list "[]".
69
  def read_json(json_string) -> list[list[str]]:
70
  try:
71
  entities = json.loads(json_string)
72
- return entities
 
 
 
 
 
 
73
 
74
  except json.JSONDecodeError as e:
75
  print(f"Error decoding JSON: {e}")
@@ -132,26 +139,22 @@ def assign_colors_to_entities(entities):
132
  def highlight_entities(text1, text2):
133
  if text1 == "" or text2 == "":
134
  return []
135
-
136
- print(f"text1: {text1}")
137
- print(f"text2: {text2}")
138
  entities_text = extract_entities_gpt(text1, text2)
139
  print(f"entities_text: {entities_text}")
140
 
 
 
 
141
  entities = read_json(entities_text)
142
-
143
  # Assign colors to entities
144
  entities_with_colors = assign_colors_to_entities(entities)
145
- print(f"entities_colors: ", entities_with_colors)
146
-
147
- # Apply highlighting to entities
148
- # highlighted_text_1 = apply_highlight(text1, entities_with_colors, "input")
149
- # highlighted_text_2 = apply_highlight(text2, entities_with_colors, "source")
150
-
151
  return entities_with_colors
152
 
153
 
154
- def apply_highlight(text, entities_with_colors, key="input"):
155
  if entities_with_colors == []:
156
  return text, []
157
 
@@ -182,7 +185,7 @@ def apply_highlight(text, entities_with_colors, key="input"):
182
  # Style the index as a label
183
  index_label = (f'<span_style="background-color:{label_color};color:white;'
184
  f'padding:1px_4px;border-radius:4px;font-size:12px;'
185
- f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1}</span>')
186
 
187
  # Append highlighted text with index label
188
  highlighted_text += (f'\n<span_style="background-color:{entity_color};color:black;'
 
15
  AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
16
 
17
  client = openai.AzureOpenAI(
18
+ api_version = "2024-05-01-preview", # AZURE_OPENAI_API_VERSION,
19
  api_key = AZURE_OPENAI_API_KEY,
20
  azure_endpoint = AZURE_OPENAI_ENDPOINT,
21
  )
22
 
23
 
24
+ def extract_entities_gpt(original_text, compared_text, text_generation_model="o1-mini"):
25
+ # "gpt-4o-mini" or "o1-mini"
26
  # Generate text using the selected models
27
  prompt = f"""
28
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
29
  Identify and extract pairs of corresponding entities where the paraphrasing has resulted in a *significant* change in meaning.
30
  Focus *only* on entities where the paraphrasing has resulted in a *significant* change in meaning. This includes, but is not limited to:
31
  * **Numerical changes:** e.g., "five" changed to "ten," "10%" changed to "50%"
32
+ * **Time changes:** e.g., "Monday" changed to "Sunday," "10th" changed to "21st"
33
  * **Name changes:** e.g., "Tokyo" changed to "New York," "Japan" changed to "Japanese"
34
  * **Opposite meanings:** e.g., "increase" changed to "decrease," "good" changed to "bad"
35
  * **Semantically different words:** e.g., "car" changed to "truck," "walk" changed to "run"
36
 
37
  Exclude entities where the meaning remains essentially the same, even if the wording is different (e.g., "big" changed to "large," "house" changed to "residence"). Also exclude purely stylistic changes that don't affect the core meaning.
38
 
39
+ Output the extracted entity pairs, one pair per line, in the following JSON-like list format without wrapping characters:
40
  [
41
  ["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
42
  ["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
 
56
  try:
57
  response = client.chat.completions.create(
58
  model=text_generation_model,
59
+ messages = [{"role": "user", "content": prompt}],
60
  )
61
 
62
  res = response.choices[0].message.content
 
70
  def read_json(json_string) -> list[list[str]]:
71
  try:
72
  entities = json.loads(json_string)
73
+ # Remove duplicates pair of entities
74
+ unique_entities = []
75
+ for inner_list in entities:
76
+ if inner_list not in unique_entities:
77
+ unique_entities.append(inner_list)
78
+
79
+ return unique_entities
80
 
81
  except json.JSONDecodeError as e:
82
  print(f"Error decoding JSON: {e}")
 
139
def highlight_entities(text1, text2):
    """Extract significantly-changed entity pairs between two texts and
    attach display colors to them.

    Returns an empty list when either text is empty; otherwise returns the
    color-annotated entity pairs produced by assign_colors_to_entities().
    """
    if text1 == "" or text2 == "":
        return []

    raw_entities = extract_entities_gpt(text1, text2)
    print(f"entities_text: {raw_entities}")

    # The model may wrap its JSON answer in Markdown code fences; strip them
    # before parsing.
    cleaned = raw_entities.replace("```json", "").replace("```", "")
    entity_pairs = read_json(cleaned)

    # Attach a distinct color to each entity pair for later highlighting.
    return assign_colors_to_entities(entity_pairs)
155
 
156
 
157
+ def apply_highlight(text, entities_with_colors, key="input", count = 0):
158
  if entities_with_colors == []:
159
  return text, []
160
 
 
185
  # Style the index as a label
186
  index_label = (f'<span_style="background-color:{label_color};color:white;'
187
  f'padding:1px_4px;border-radius:4px;font-size:12px;'
188
+ f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>')
189
 
190
  # Append highlighted text with index label
191
  highlighted_text += (f'\n<span_style="background-color:{entity_color};color:black;'
src/application/text/helper.py CHANGED
@@ -1,4 +1,5 @@
1
  from collections import Counter
 
2
  import re
3
  import string
4
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -144,6 +145,30 @@ def extract_important_phrases(paragraph: str, keywords: list[str], phrase_length
144
 
145
  return important_phrases
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  def connect_consecutive_indexes(nums):
148
  """
149
  Connects consecutive integers in a list.
 
1
  from collections import Counter
2
+ from difflib import SequenceMatcher
3
  import re
4
  import string
5
  from sklearn.feature_extraction.text import TfidfVectorizer
 
145
 
146
  return important_phrases
147
 
148
def extract_equal_text(text1, text2):
    """Find the word-index ranges where two texts match exactly.

    Both texts are normalized (lowercased, punctuation stripped) and split
    into words before alignment, so matching is case- and
    punctuation-insensitive. The returned indexes refer to positions in the
    normalized word lists.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        A pair of lists, one per input text. Each element is a dict
        {"start": i, "end": j} marking a half-open word range [i, j)
        that is identical in both texts.
    """
    def normalize(text):
        # Lowercase and drop punctuation so the comparison ignores
        # case and punctuation differences.
        text = text.lower()
        return text.translate(str.maketrans("", "", string.punctuation))

    words1 = normalize(text1).split()
    words2 = normalize(text2).split()

    matcher = SequenceMatcher(None, words1, words2)

    equal_idx_1 = []
    equal_idx_2 = []
    # Only the "equal" opcodes mark runs of words shared by both texts.
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})
    return equal_idx_1, equal_idx_2
171
+
172
  def connect_consecutive_indexes(nums):
173
  """
174
  Connects consecutive integers in a list.
src/application/text/preprocessing.py CHANGED
@@ -1,6 +1,6 @@
1
  from nltk.tokenize import sent_tokenize
2
 
3
- def split_into_sentences(input_text):
4
  """
5
  Splits input text into sentences by newlines.
6
 
 
1
  from nltk.tokenize import sent_tokenize
2
 
3
+ def split_into_paragraphs(input_text):
4
  """
5
  Splits input text into sentences by newlines.
6
 
src/application/text/search_detection.py CHANGED
@@ -1,9 +1,11 @@
 
1
  import warnings
2
  warnings.simplefilter(action='ignore', category=FutureWarning)
3
 
4
- from src.application.text.preprocessing import split_into_sentences
5
  from src.application.text.search import generate_search_phrases, search_by_google
6
  from src.application.url_reader import URLReader
 
7
  import numpy as np
8
  import nltk
9
  import torch
@@ -41,7 +43,7 @@ def detect_text_by_relative_search(input_text, index, is_support_opposite = Fals
41
  search_results = search_by_google(candidate)
42
  urls = [item['link'] for item in search_results.get("items", [])]
43
 
44
- for url in urls[:10]:
45
  if url in checked_urls: # visited url
46
  continue
47
  if "bbc.com" not in url:
@@ -196,14 +198,13 @@ def check_paraphrase(input_text, page_text, url):
196
  return False, []
197
 
198
  # Extract sentences from input text and web page
199
- #input_text = remove_punctuation(input_text)
200
- input_sentences = split_into_sentences(input_text)
201
 
202
-
203
  if not page_text:
204
  return is_paraphrase_text, []
205
- #page_text = remove_punctuation(page_text)
206
- page_sentences = split_into_sentences(page_text)
207
  if not input_sentences or not page_sentences:
208
  return is_paraphrase_text, []
209
 
@@ -213,8 +214,6 @@ def check_paraphrase(input_text, page_text, url):
213
  additional_sentences.append(sentence.replace(", external", ""))
214
  page_sentences.extend(additional_sentences)
215
 
216
- # min_matching_sentences = math.ceil(len(input_sentences) * MIN_RATIO_PARAPHRASE_NUM)
217
-
218
  # Encode sentences into embeddings
219
  embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
220
  embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)
@@ -225,10 +224,12 @@ def check_paraphrase(input_text, page_text, url):
225
  # Find sentence alignments
226
  alignment = {}
227
  paraphrased_sentence_count = 0
 
228
  for i, sentence1 in enumerate(input_sentences):
229
  max_sim_index = np.argmax(similarity_matrix[i])
230
  max_similarity = similarity_matrix[i][max_sim_index]
231
-
 
232
  is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
233
 
234
  if is_paraphrase_sentence is False:
@@ -262,9 +263,20 @@ def check_paraphrase(input_text, page_text, url):
262
  # Check if enough sentences are paraphrases
263
 
264
  is_paraphrase_text = paraphrased_sentence_count > 0 #min_matching_sentences
265
-
 
 
 
 
 
 
 
 
 
 
266
  return is_paraphrase_text, alignment
267
 
 
268
  def similarity_ratio(a, b):
269
  """
270
  Calculates the similarity ratio between two strings using SequenceMatcher.
 
1
+ import string
2
  import warnings
3
  warnings.simplefilter(action='ignore', category=FutureWarning)
4
 
5
+ from src.application.text.preprocessing import split_into_paragraphs
6
  from src.application.text.search import generate_search_phrases, search_by_google
7
  from src.application.url_reader import URLReader
8
+ from src.application.text.helper import extract_equal_text
9
  import numpy as np
10
  import nltk
11
  import torch
 
43
  search_results = search_by_google(candidate)
44
  urls = [item['link'] for item in search_results.get("items", [])]
45
 
46
+ for url in urls[:3]:
47
  if url in checked_urls: # visited url
48
  continue
49
  if "bbc.com" not in url:
 
198
  return False, []
199
 
200
  # Extract sentences from input text and web page
201
+ # input_sentences = split_into_paragraphs(input_text)
202
+ input_sentences = [input_text]
203
 
 
204
  if not page_text:
205
  return is_paraphrase_text, []
206
+
207
+ page_sentences = split_into_paragraphs(page_text)
208
  if not input_sentences or not page_sentences:
209
  return is_paraphrase_text, []
210
 
 
214
  additional_sentences.append(sentence.replace(", external", ""))
215
  page_sentences.extend(additional_sentences)
216
 
 
 
217
  # Encode sentences into embeddings
218
  embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
219
  embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)
 
224
  # Find sentence alignments
225
  alignment = {}
226
  paraphrased_sentence_count = 0
227
+ best_matched_sentence = ""
228
  for i, sentence1 in enumerate(input_sentences):
229
  max_sim_index = np.argmax(similarity_matrix[i])
230
  max_similarity = similarity_matrix[i][max_sim_index]
231
+
232
+ best_matched_sentence = page_sentences[max_sim_index]
233
  is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
234
 
235
  if is_paraphrase_sentence is False:
 
263
  # Check if enough sentences are paraphrases
264
 
265
  is_paraphrase_text = paraphrased_sentence_count > 0 #min_matching_sentences
266
+
267
+ # Method 2: Check if overlapped words between sentences are more than 50%
268
+ equal_idx_1, _ = extract_equal_text(input_sentences[0], best_matched_sentence)
269
+ matched_count = 0
270
+ for index in equal_idx_1:
271
+ matched_count += index["end"] - index["start"]
272
+ sent = input_sentences[0].translate(str.maketrans('', '', string.punctuation))
273
+ num_words = len(sent.split())
274
+ if matched_count > num_words / 2:
275
+ is_paraphrase_text = True
276
+
277
  return is_paraphrase_text, alignment
278
 
279
+
280
  def similarity_ratio(a, b):
281
  """
282
  Calculates the similarity ratio between two strings using SequenceMatcher.
test.py CHANGED
@@ -1,46 +1,27 @@
1
- from difflib import SequenceMatcher
2
- import string
3
 
4
- def extract_equal_text(text1, text2):
5
- def cleanup(text):
6
- text = text.lower()
7
- text = text.translate(str.maketrans('', '', string.punctuation))
8
- return text
9
-
10
- splited_text1 = cleanup(text1).split()
11
- splited_text2 = cleanup(text2).split()
12
-
13
- s = SequenceMatcher(None, splited_text1, splited_text2)
14
-
15
- equal_idx_1 = []
16
- equal_idx_2 = []
17
- text1 = text1.split()
18
- text2 = text2.split()
19
- for tag, i1, i2, j1, j2 in s.get_opcodes():
20
- if tag == 'equal':
21
- equal_idx_1.append({"start": i1, "end": i2})
22
- equal_idx_2.append({"start": j1, "end": j2})
23
- subtext_1 = " ".join(text1[i1:i2])
24
- subtext_2 = " ".join(text2[j1:j2])
25
- print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] {subtext_1!r:>55} --> {subtext_2!r}')
26
-
27
- return equal_idx_1, equal_idx_2
28
-
29
- text1 = """
30
- Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m.
31
- Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
32
- He made a substitute appearance and waved farewell to fans in Newcastle's recent win against Southampton.
33
- Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2022-23, and scored against Paris St-Germain in the Champions League.
34
  """
35
- text2 = """
36
- Newcastle United winger Miguel Almiron has rejoined Atlanta United on a permanent deal for £8m.
37
- Almiron has made 223 appearances for Newcastle, scoring 30 goals, but has struggled recently to gain a place in manager Eddie Howe's starting line-up.
38
- Last weekend he came on as a substitute in Newcastle's 3-1 win against Southampton and waved farewell to the travelling supporters.
39
- Almiron played a significant role in Newcastle reaching the Carabao Cup final and finishing fourth in the Premier League in 2022-23.
40
- """
41
-
42
- idx_1, idx_2 = extract_equal_text(text1, text2)
 
 
43
 
44
- # text1_split = text1.split()
45
- # for idx in idx_1:
46
- # print(text1_split[idx["start"]:idx["end"]])
 
 
1
+ import json
 
2
 
3
+ text = """```json
4
+ [
5
+ ["Sunday", "Thursday"],
6
+ ["two millions", "one million"],
7
+ ["north", "east"],
8
+ ["Japan", "UK"],
9
+ ["Sunday", "Thursday"]
10
+ ]
11
+ ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  """
13
def read_json(json_string) -> list[list[str]]:
    """Parse a JSON string into a list of entity pairs, dropping duplicates.

    Keeps only the first occurrence of each pair, preserving order.
    Returns an empty list (after logging the error) when the string is
    not valid JSON.
    """
    try:
        pairs = json.loads(json_string)
        # Keep the first occurrence of every pair, in order.
        return [p for i, p in enumerate(pairs) if p not in pairs[:i]]

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return []
27
+ print(read_json(text.replace("```json", "").replace("```", "")))