Spaces: Running

Commit 26e3944 · Parent(s): 56cf7e3

add 2 more screen for ordinary and governor users

Files changed:
- application_2.py +49 -12
- examples/example_text_LLM_modification.txt +3 -1
- src/application/content_detection.py +271 -93
- src/application/text/entity.py +21 -18
- src/application/text/helper.py +25 -0
- src/application/text/preprocessing.py +1 -1
- src/application/text/search_detection.py +23 -11
- test.py +24 -43
application_2.py CHANGED

@@ -8,10 +8,6 @@ from src.application.content_detection import NewsVerification
 from src.application.url_reader import URLReader
 from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
 
-
-GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
-SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
-
 AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
 AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
 
@@ -52,7 +48,7 @@ def generate_analysis_report(news_title: str, news_content: str, news_image: Image)
 # Define the GUI
 with gr.Blocks() as demo:
     gr.Markdown("# NEWS VERIFICATION")
-
+
     with gr.Row():
         # SETTINGS
        with gr.Column(scale=1):
@@ -88,15 +84,56 @@ with gr.Blocks() as demo:
             news_content = gr.Textbox(label="Content", value="", lines=13)
 
     # NEWS ANALYSIS REPORT
-
-    - Green texts are the matched words in the input and source news.<br>
-    - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
-    """
+    ordinary_user_explanation = """
+    FOR ORDINARY USER<br>
+    - Green texts are the matched words in the input and source news.<br>
+    - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+    """
+    fact_checker_explanation = """
+    FOR FACT CHECKER<br>
+    - Green texts are the matched words in the input and source news.<br>
+    - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+    """
+    governor_explanation = """
+    FOR GOVERNOR<br>
+    - Green texts are the matched words in the input and source news.<br>
+    - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+    """
+    table = """
+    <h5>Comparison between input news and source news</h5>
+    <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
+    <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
+    <thead>
+    <tr>
+        <th>Input news</th>
+        <th>Source (URL provided in Originality column correspondingly)</th>
+        <th>Forensic</th>
+        <th>Originality</th>
+    </tr>
+    </thead>
+    <tbody>
+    <tr>
+        <th>Input 1</th>
+        <th>Source 1 (URL provided in Originality column correspondingly)</th>
+        <th>Forensic 1</th>
+        <th>Originality 1</th>
+    </tr>
+    </tbody>
+    </table>
+
+    <style>"""
         with gr.Column(scale=2):
-            with gr.Accordion("
-            gr.
-
-
+            with gr.Accordion("NEWS ANALYSIS"):
+                verification_button = gr.Button("Verify news")
+                with gr.Tab("Ordinary User"):
+                    gr.HTML(ordinary_user_explanation)
+                    ordinary_user_result = gr.HTML(table)
+                with gr.Tab("Fact Checker"):
+                    gr.HTML(fact_checker_explanation)
+                    fact_checker_result = gr.HTML("<br>"*40)
+                with gr.Tab("Governor"):
+                    gr.HTML(governor_explanation)
+                    governor_result = gr.HTML(table)
 
     # Connect events
     load_button.click(
@@ -113,9 +150,9 @@ with gr.Blocks() as demo:
     generate_image_button.click(generate_fake_image,
                                 inputs=[image_generation_model, news_title],
                                 outputs=[news_image])
-
+    verification_button.click(generate_analysis_report,
                                 inputs=[news_title, news_content, news_image],
-                                outputs=[
+                                outputs=[ordinary_user_result, fact_checker_result, governor_result])
 
     # change Image
     #url_input.change(load_image, inputs=url_input, outputs=image_view)
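The core of this UI change is that a single click handler now fans out to three audience-specific panes. Below is a minimal, self-contained sketch of that wiring pattern; the stub callback stands in for generate_analysis_report (which in the real app also receives news_image), and the layout is trimmed to the pieces relevant here.

import gradio as gr

# Stand-in for generate_analysis_report: one HTML report per audience tab.
def analyze(title: str, content: str) -> tuple[str, str, str]:
    report = f"<p>Report for: {title or 'untitled'}</p>"
    return report, report, report

with gr.Blocks() as demo:
    news_title = gr.Textbox(label="Title")
    news_content = gr.Textbox(label="Content")
    with gr.Accordion("NEWS ANALYSIS"):
        verification_button = gr.Button("Verify news")
        with gr.Tab("Ordinary User"):
            ordinary_user_result = gr.HTML()
        with gr.Tab("Fact Checker"):
            fact_checker_result = gr.HTML()
        with gr.Tab("Governor"):
            governor_result = gr.HTML()
    # A single event can populate all three tabs: the callback returns a
    # 3-tuple that Gradio maps onto the outputs list in order.
    verification_button.click(
        analyze,
        inputs=[news_title, news_content],
        outputs=[ordinary_user_result, fact_checker_result, governor_result],
    )

if __name__ == "__main__":
    demo.launch()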
examples/example_text_LLM_modification.txt CHANGED

@@ -1 +1,3 @@
-Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
+Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
+He made a substitute appearance and waved farewell to fans in Newcastle's recent loss against Southampton.
+Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2020-21, and scored against Paris St-Germain in the Champions League.
src/application/content_detection.py CHANGED

@@ -1,10 +1,9 @@
 from difflib import SequenceMatcher
-import difflib
-import string
 from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
 from src.application.text.entity import apply_highlight, highlight_entities
+from src.application.text.helper import extract_equal_text
 from src.application.text.model_detection import detect_text_by_ai_model
-from src.application.text.preprocessing import
+from src.application.text.preprocessing import split_into_paragraphs
 from src.application.text.search_detection import check_human, detect_text_by_relative_search
 
 
@@ -27,7 +26,10 @@ class NewsVerification():
         self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
         self.aligned_sentences:list[dict] = []
         self.is_paraphrased:list[bool] = []
-        self.analyzed_table:list = []
+
+        self.ordinary_user_table:list = []
+        self.fact_checker_table:list = []
+        self.governor_table:list = []
 
     def load_news(self, news_title, news_content, news_image):
         self.news_text = news_title + "\n\n" + news_content
@@ -50,7 +52,7 @@ class NewsVerification():
         print("CHECK TEXT:")
         print("\tFrom search engine:")
         # Classify by search engine
-        input_sentences =
+        input_sentences = split_into_paragraphs(self.news_text)
         current_index = 0
         previous_paraphrase = None
         ai_sentence = {
@@ -87,7 +89,7 @@ class NewsVerification():
                 else:
                     if previous_paraphrase is False or previous_paraphrase is None:
                         # add ai_sentences to align_sentences
-                        if ai_sentence["input_sentence"] != "":
+                        if ai_sentence["input_sentence"] != "" or current_index >= len(input_sentences):
                             text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"])
                             ai_sentence["label"] = text_prediction_label
                             ai_sentence["similarity"] = text_prediction_score
@@ -102,7 +104,7 @@ class NewsVerification():
                             "paraphrase": False,
                             "url": "",
                         }
-
+
                     # add searched_sentences to align_sentences
                     if searched_sentences["input_sentence"] != "":
                         self.found_img_url.extend(img_urls)
@@ -181,63 +183,11 @@ class NewsVerification():
         self.detect_image_origin()
 
     def analyze_details(self):
-        for aligned_sentence in self.aligned_sentences:
-
-            if "input_sentence" not in aligned_sentence:
-                continue
-
-            # Get index of equal phrases in input and source sentences
-            equal_idx_1, equal_idx_2 = self.extract_equal_text(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
-            )
-
-            # Get entity-words (in pair) with colors
-            entities_with_colors = highlight_entities(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
-            )
-
-            self.analyzed_table.append(
-                [
-                    aligned_sentence["input_sentence"],
-                    aligned_sentence["matched_sentence"],
-                    equal_idx_1,
-                    equal_idx_2,
-                    entities_with_colors,
-                ]
-            )
-
-        if len(self.analyzed_table) != 0:
-            html_table = self.create_table()
-        else:
-            html_table = ""
-        return html_table
-
-    def extract_equal_text(self, text1, text2):
-        def cleanup(text):
-            text = text.lower()
-            text = text.translate(str.maketrans('', '', string.punctuation))
-            return text
-
-        splited_text1 = cleanup(text1).split()
-        splited_text2 = cleanup(text2).split()
-
-        s = SequenceMatcher(None, splited_text1, splited_text2)
-
-        equal_idx_1 = []
-        equal_idx_2 = []
-        text1 = text1.split()
-        text2 = text2.split()
-        for tag, i1, i2, j1, j2 in s.get_opcodes():
-            if tag == 'equal':
-                equal_idx_1.append({"start": i1, "end": i2})
-                equal_idx_2.append({"start": j1, "end": j2})
-                # subtext_1 = " ".join(text1[i1:i2])
-                # subtext_2 = " ".join(text2[j1:j2])
-                # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
-        return equal_idx_1, equal_idx_2
+        ordinary_user_table = self.create_ordinary_user_table()
+        fact_checker_table = self.create_fact_checker_table()
+        governor_table = self.create_governor_table()
 
+        return ordinary_user_table, fact_checker_table, governor_table
 
     def get_text_urls(self):
         return set(self.text_referent_url)
@@ -264,7 +214,7 @@ class NewsVerification():
         if not sentence_1 or not sentence_2: # Handle empty strings
             return []
 
-        s =
+        s = SequenceMatcher(None, sentence_1, sentence_2)
         common_phrases = []
 
         for block in s.get_matching_blocks():
@@ -287,16 +237,40 @@ class NewsVerification():
             position += len(sentence_1)
         return common_phrases, position
 
-    def create_table(self):
-        #table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table])
-        # loop of self.analyzed_table with index:
+    def create_fact_checker_table(self):
         rows = []
         max_length = 30 # TODO: put this in configuration
-        rows.append(self.
+        rows.append(self.format_image_fact_checker_row(max_length))
 
-        for
-
+        for aligned_sentence in self.aligned_sentences:
+            if "input_sentence" not in aligned_sentence:
+                continue
+
+            # Get index of equal phrases in input and source sentences
+            equal_idx_1, equal_idx_2 = extract_equal_text(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+
+            # Get entity-words (in pair) with colors
+            entities_with_colors = highlight_entities(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+
+            self.fact_checker_table.append(
+                [
+                    aligned_sentence,
+                    equal_idx_1,
+                    equal_idx_2,
+                    entities_with_colors,
+                ]
+            )
+
+        for row in self.fact_checker_table:
+            formatted_row = self.format_text_fact_checker_row(row, max_length)
             rows.append(formatted_row)
+
         table = "\n".join(rows)
         return f"""
         <h5>Comparison between input news and source news</h5>
@@ -316,29 +290,29 @@ class NewsVerification():
 
        <style>
        """
-
-    def format_row(self, row, max_length=30):
-        if row[
+
+    def format_text_fact_checker_row(self, row, max_length=30):
+        if row[0]["input_sentence"] == "":
+            return ""
+        if row[0]["matched_sentence"] != "": # source is not empty
             # highlight entities
-            input_sentence, highlight_idx_input = apply_highlight(row[0], row[
-            source_sentence, highlight_idx_source = apply_highlight(row[
-            print(f"highlighted_input: {input_sentence}")
+            input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input")
+            source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source")
 
             # Color overlapping words
-            input_sentence = self.color_text(input_sentence, row[
-            source_sentence = self.color_text(source_sentence, row[
-            print(f"input_sentence: {input_sentence}")
+            input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
+            source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source) # text, index of highlight words
 
             input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
             source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
         else:
-            input_sentence = row[0]
-            source_sentence = row[
+            input_sentence = row[0]["input_sentence"]
+            source_sentence = row[0]["matched_sentence"]
 
-        label =
-        score =
+        label = row[0]["label"]
+        score = row[0]["similarity"]
 
-        url =
+        url = row[0]["url"]
         short_url = self.shorten_url(url, max_length)
         source_text_url = f"""<a href="{url}">{short_url}</a>"""
 
@@ -351,8 +325,7 @@ class NewsVerification():
             </tr>
         """
 
-    def
-        # input_image = f"""<img src="example_image_input.jpg" width="200" height="150">"""
+    def format_image_fact_checker_row(self, max_length=30):
 
         if self.image_referent_url is not None or self.image_referent_url != "":
             source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
@@ -363,7 +336,216 @@ class NewsVerification():
             source_image_url = ""
 
         return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
-
+
+
+    def create_ordinary_user_table(self):
+        rows = []
+        max_length = 30 # TODO: put this in configuration
+        rows.append(self.format_image_ordinary_user_row(max_length))
+        rows.append(self.format_text_ordinary_user_row(max_length))
+        table = "\n".join(rows)
+
+        return f"""
+        <h5>Comparison between input news and source news</h5>
+        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
+        <thead>
+        <tr>
+            <th>Input news</th>
+            <th>Forensic</th>
+            <th>Originality</th>
+        </tr>
+        </thead>
+        <tbody>
+        {table}
+        </tbody>
+        </table>
+
+        <style>
+        """
+
+    def format_text_ordinary_user_row(self, max_length=30):
+        input_sentences = ""
+        source_text_urls = ""
+        label = ""
+        scores = 0
+        sentence_count = 0
+        for index, row in enumerate(self.aligned_sentences):
+            if row["input_sentence"] == "":
+                continue
+            input_sentences += row["input_sentence"]
+            label = self.aligned_sentences[index]["label"]
+            if label == "HUMAN":
+                score = self.aligned_sentences[index]["similarity"]
+            if label == "MACHINE":
+                score = 1 - self.aligned_sentences[index]["similarity"]
+            scores += score
+
+            url = self.aligned_sentences[index]["url"]
+            short_url = self.shorten_url(url, max_length)
+            source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
+            sentence_count += 1
+
+        if scores == 0:
+            label = "UNKNOWN"
+        else:
+            scores /= sentence_count
+            if scores > 0.5:
+                label = "HUMAN"
+            else:
+                label = "MACHINE"
+                scores = 1 - scores
+
+        return f"""
+        <tr>
+            <td>{input_sentences}</td>
+            <td>{label}<br>({scores*100:.2f}%)</td>
+            <td>{source_text_urls}</td>
+        </tr>
+        """
+
+    def format_image_ordinary_user_row(self, max_length=30):
+
+        if self.image_referent_url is not None or self.image_referent_url != "":
+            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
+            short_url = self.shorten_url(self.image_referent_url, max_length)
+            source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
+        else:
+            source_image = "Image not found"
+            source_image_url = ""
+
+        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
+
+
+    def create_governor_table(self):
+        rows = []
+        max_length = 30 # TODO: put this in configuration
+        rows.append(self.format_image_governor_row(max_length))
+
+        for aligned_sentence in self.aligned_sentences:
+            if "input_sentence" not in aligned_sentence:
+                continue
+
+            # Get index of equal phrases in input and source sentences
+            equal_idx_1, equal_idx_2 = extract_equal_text(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+
+            # Get entity-words (in pair) with colors
+            entities_with_colors = highlight_entities(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+
+            self.governor_table.append(
+                [
+                    aligned_sentence,
+                    equal_idx_1,
+                    equal_idx_2,
+                    entities_with_colors,
+                ]
+            )
+
+        formatted_row = self.format_text_governor_row(max_length)
+        rows.append(formatted_row)
+
+        table = "\n".join(rows)
+        return f"""
+        <h5>Comparison between input news and source news</h5>
+        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
+        <col style="width: 150px;"> <col style="width: 150px;"> <col style="width: 50px;"> <col style="width: 75px;">
+        <thead>
+        <tr>
+            <th>Input news</th>
+            <th>Source (URL provided in Originality column correspondingly)</th>
+            <th>Forensic</th>
+            <th>Originality</th>
+        </tr>
+        </thead>
+        <tbody>
+        {table}
+        </tbody>
+        </table>
+
+        <style>
+        """
+
+    def format_text_governor_row(self, max_length=30):
+        input_sentences = ""
+        source_sentences = ""
+        source_text_urls = ""
+        label = ""
+        scores = 0
+        sentence_count = 0
+        entity_count = 0
+        for row in self.governor_table:
+            print(f"governor_row: {row}")
+            if row[0]["input_sentence"] == "":
+                continue
+
+            if row[0]["matched_sentence"] != "": # source is not empty
+                # highlight entities
+                input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input", entity_count)
+                source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source", entity_count)
+                entity_count += len(row[3])
+
+                # Color overlapping words
+                input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
+                source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source) # text, index of highlight words
+
+                input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
+                source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
+
+            else:
+                input_sentence = row[0]["input_sentence"]
+                source_sentence = row[0]["matched_sentence"]
+
+            input_sentences += input_sentence
+            source_sentences += source_sentence
+            score = row[0]["similarity"]
+            label = row[0]["label"]
+            if label == "HUMAN":
+                score = row[0]["similarity"]
+            if label == "MACHINE":
+                score = 1 - row[0]["similarity"]
+            scores += score
+
+            url = row[0]["url"]
+            short_url = self.shorten_url(url, max_length)
+            source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
+            sentence_count += 1
+
+        if scores == 0:
+            label = "UNKNOWN"
+        else:
+            scores /= sentence_count
+            if scores > 0.5:
+                label = "HUMAN"
+            else:
+                label = "MACHINE"
+                scores = 1 - scores
+
+        return f"""
+        <tr>
+            <td>{input_sentences}</td>
+            <td>{source_sentences}</td>
+            <td>{label}<br>({scores*100:.2f}%)</td>
+            <td>{source_text_urls}</td>
+        </tr>
+        """
+
+    def format_image_governor_row(self, max_length=30):
+        if self.image_referent_url is not None or self.image_referent_url != "":
+            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
+            short_url = self.shorten_url(self.image_referent_url, max_length)
+            source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
+        else:
+            source_image = "Image not found"
+            source_image_url = ""
+
+        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
+
+
     def shorten_url(self, url, max_length=30):
         if url is None:
             return ""
@@ -374,23 +556,19 @@ class NewsVerification():
             short_url = url
         return short_url
 
+
     def color_text(self, text, colored_idx, highlighted_idx):
         paragraph = ""
         words = text.split()
 
         starts, ends = self.extract_starts_ends(colored_idx)
         starts, ends = self.filter_indices(starts, ends, highlighted_idx)
-
-        print(f"starts_2: {starts}")
-        print(f"ends_2: {ends}")
+
         previous_end = 0
         for start, end in zip(starts, ends):
            paragraph += " ".join(words[previous_end:start])
 
            equal_words = " ".join(words[start:end])
-           print(f"starts_2: {start}")
-           print(f"ends_2: {end}")
-           print(f"equal_words: {words[start:end]}")
            paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
 
            previous_end = end
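The table builders above collapse per-sentence predictions into one verdict: HUMAN similarity scores count as-is, MACHINE scores are flipped so that higher always means "more human", the mean decides the final label, and a MACHINE verdict reports the flipped mean. A condensed sketch of that aggregation from format_text_ordinary_user_row / format_text_governor_row (the input pairs are made-up examples, not values from the app):

# Condensed restatement of the verdict logic in format_text_ordinary_user_row
# and format_text_governor_row.
def aggregate(sentences: list[tuple[str, float]]) -> tuple[str, float]:
    scores = 0.0
    count = 0
    for label, similarity in sentences:
        if label == "HUMAN":
            scores += similarity        # confidence the sentence is human-written
        elif label == "MACHINE":
            scores += 1 - similarity    # flip so higher always means "human"
        count += 1
    if count == 0 or scores == 0:
        return "UNKNOWN", 0.0
    scores /= count
    if scores > 0.5:
        return "HUMAN", scores
    return "MACHINE", 1 - scores        # report confidence in the MACHINE verdict

print(aggregate([("HUMAN", 0.9), ("MACHINE", 0.8)]))  # ('HUMAN', ~0.55)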
src/application/text/entity.py CHANGED

@@ -15,27 +15,28 @@ AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
 AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
 
 client = openai.AzureOpenAI(
-    api_version = AZURE_OPENAI_API_VERSION,
+    api_version = "2024-05-01-preview", # AZURE_OPENAI_API_VERSION,
     api_key = AZURE_OPENAI_API_KEY,
     azure_endpoint = AZURE_OPENAI_ENDPOINT,
 )
 
 
-def extract_entities_gpt(original_text, compared_text, text_generation_model="gpt-4o-mini"):
-    # "o1-mini"
+def extract_entities_gpt(original_text, compared_text, text_generation_model="o1-mini"):
+    # "gpt-4o-mini" or "o1-mini"
     # Generate text using the selected models
     prompt = f"""
 Compare the ORIGINAL TEXT and the COMPARED TEXT.
 Identify and extract pairs of corresponding entities where the paraphrasing has resulted in a *significant* change in meaning.
 Focus *only* on entities where the paraphrasing has resulted in a *significant* change in meaning. This includes, but is not limited to:
 * **Numerical changes:** e.g., "five" changed to "ten," "10%" changed to "50%"
+* **Time changes:** e.g., "Monday" changed to "Sunday," "10th" changed to "21st"
 * **Name changes:** e.g., "Tokyo" changed to "New York," "Japan" changed to "Japanese"
 * **Opposite meanings:** e.g., "increase" changed to "decrease," "good" changed to "bad"
 * **Semantically different words:** e.g., "car" changed to "truck," "walk" changed to "run"
 
 Exclude entities where the meaning remains essentially the same, even if the wording is different (e.g., "big" changed to "large," "house" changed to "residence"). Also exclude purely stylistic changes that don't affect the core meaning.
 
-Output the extracted entity pairs, one pair per line, in the following JSON-like list format:
+Output the extracted entity pairs, one pair per line, in the following JSON-like list format without wrapping characters:
 [
     ["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
     ["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
@@ -55,7 +56,7 @@ If there are no entities that satisfy above condition, output empty list "[]".
     try:
         response = client.chat.completions.create(
             model=text_generation_model,
-            messages = [{"role": "
+            messages = [{"role": "user", "content": prompt}],
         )
 
         res = response.choices[0].message.content
@@ -69,7 +70,13 @@ If there are no entities that satisfy above condition, output empty list "[]".
 def read_json(json_string) -> list[list[str]]:
     try:
         entities = json.loads(json_string)
-        return entities
+        # Remove duplicate pairs of entities
+        unique_entities = []
+        for inner_list in entities:
+            if inner_list not in unique_entities:
+                unique_entities.append(inner_list)
+
+        return unique_entities
 
     except json.JSONDecodeError as e:
         print(f"Error decoding JSON: {e}")
@@ -132,26 +139,22 @@ def assign_colors_to_entities(entities):
 def highlight_entities(text1, text2):
     if text1 == "" or text2 == "":
         return []
-
-    print(f"text1: {text1}")
-    print(f"text2: {text2}")
+
     entities_text = extract_entities_gpt(text1, text2)
     print(f"entities_text: {entities_text}")
 
+    # Clean up entities: remove wrapping characters
+    entities_text = entities_text.replace("```json", "").replace("```", "")
+
     entities = read_json(entities_text)
-
+
     # Assign colors to entities
     entities_with_colors = assign_colors_to_entities(entities)
-
-
-    # Apply highlighting to entities
-    # highlighted_text_1 = apply_highlight(text1, entities_with_colors, "input")
-    # highlighted_text_2 = apply_highlight(text2, entities_with_colors, "source")
-
+
     return entities_with_colors
 
 
-def apply_highlight(text, entities_with_colors, key="input"):
+def apply_highlight(text, entities_with_colors, key="input", count = 0):
     if entities_with_colors == []:
         return text, []
 
@@ -182,7 +185,7 @@ def apply_highlight(text, entities_with_colors, key="input"):
     # Style the index as a label
     index_label = (f'<span_style="background-color:{label_color};color:white;'
                    f'padding:1px_4px;border-radius:4px;font-size:12px;'
-                   f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1}</span>')
+                   f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>')
 
     # Append highlighted text with index label
     highlighted_text += (f'\n<span_style="background-color:{entity_color};color:black;'
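Two details of this entity.py change are easy to miss. The default model moves to o1-mini and the prompt is sent as a single user-role message; the o1-series models did not accept a system role at the time, so a plain user message is presumably the compatible choice here. Separately, the new "without wrapping characters" instruction in the prompt and the ```json fence stripping in highlight_entities work as a pair: models often wrap JSON output in code fences regardless of instructions, and read_json would otherwise fail to parse the result.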
src/application/text/helper.py CHANGED

@@ -1,4 +1,5 @@
 from collections import Counter
+from difflib import SequenceMatcher
 import re
 import string
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -144,6 +145,30 @@ def extract_important_phrases(paragraph: str, keywords: list[str], phrase_length
 
     return important_phrases
 
+def extract_equal_text(text1, text2):
+    def cleanup(text):
+        text = text.lower()
+        text = text.translate(str.maketrans('', '', string.punctuation))
+        return text
+
+    splited_text1 = cleanup(text1).split()
+    splited_text2 = cleanup(text2).split()
+
+    s = SequenceMatcher(None, splited_text1, splited_text2)
+
+    equal_idx_1 = []
+    equal_idx_2 = []
+    text1 = text1.split()
+    text2 = text2.split()
+    for tag, i1, i2, j1, j2 in s.get_opcodes():
+        if tag == 'equal':
+            equal_idx_1.append({"start": i1, "end": i2})
+            equal_idx_2.append({"start": j1, "end": j2})
+            # subtext_1 = " ".join(text1[i1:i2])
+            # subtext_2 = " ".join(text2[j1:j2])
+            # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] {subtext_1!r:>55} --> {subtext_2!r}')
+    return equal_idx_1, equal_idx_2
+
 def connect_consecutive_indexes(nums):
     """
     Connects consecutive integers in a list.
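extract_equal_text runs SequenceMatcher over the lowercased, punctuation-stripped token lists and returns the 'equal' opcode spans as word-index ranges, which color_text later maps back onto the raw words. A quick usage sketch (module path as in this repo):

from src.application.text.helper import extract_equal_text

idx1, idx2 = extract_equal_text(
    "Almiron made 223 appearances for Newcastle.",
    "Almiron made 200 appearances for Newcastle United.",
)
# Each dict is a {"start", "end"} word-index range into the respective sentence:
print(idx1)  # [{'start': 0, 'end': 2}, {'start': 3, 'end': 6}]
print(idx2)  # [{'start': 0, 'end': 2}, {'start': 3, 'end': 6}]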
src/application/text/preprocessing.py CHANGED

@@ -1,6 +1,6 @@
 from nltk.tokenize import sent_tokenize
 
-def
+def split_into_paragraphs(input_text):
     """
     Splits input text into sentences by newlines.
 
src/application/text/search_detection.py CHANGED

@@ -1,9 +1,11 @@
+import string
 import warnings
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
-from src.application.text.preprocessing import
+from src.application.text.preprocessing import split_into_paragraphs
 from src.application.text.search import generate_search_phrases, search_by_google
 from src.application.url_reader import URLReader
+from src.application.text.helper import extract_equal_text
 import numpy as np
 import nltk
 import torch
@@ -41,7 +43,7 @@ def detect_text_by_relative_search(input_text, index, is_support_opposite = False):
     search_results = search_by_google(candidate)
     urls = [item['link'] for item in search_results.get("items", [])]
 
-    for url in urls[:
+    for url in urls[:3]:
         if url in checked_urls: # visited url
             continue
         if "bbc.com" not in url:
@@ -196,14 +198,13 @@ def check_paraphrase(input_text, page_text, url):
         return False, []
 
     # Extract sentences from input text and web page
-    #
-    input_sentences =
+    # input_sentences = split_into_paragraphs(input_text)
+    input_sentences = [input_text]
 
-
     if not page_text:
         return is_paraphrase_text, []
-
-    page_sentences =
+
+    page_sentences = split_into_paragraphs(page_text)
     if not input_sentences or not page_sentences:
         return is_paraphrase_text, []
 
@@ -213,8 +214,6 @@ def check_paraphrase(input_text, page_text, url):
         additional_sentences.append(sentence.replace(", external", ""))
     page_sentences.extend(additional_sentences)
 
-    # min_matching_sentences = math.ceil(len(input_sentences) * MIN_RATIO_PARAPHRASE_NUM)
-
     # Encode sentences into embeddings
     embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
     embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)
@@ -225,10 +224,12 @@ def check_paraphrase(input_text, page_text, url):
     # Find sentence alignments
     alignment = {}
     paraphrased_sentence_count = 0
+    best_matched_sentence = ""
     for i, sentence1 in enumerate(input_sentences):
         max_sim_index = np.argmax(similarity_matrix[i])
         max_similarity = similarity_matrix[i][max_sim_index]
-
+
+        best_matched_sentence = page_sentences[max_sim_index]
         is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
 
         if is_paraphrase_sentence is False:
@@ -262,9 +263,20 @@ def check_paraphrase(input_text, page_text, url):
     # Check if enough sentences are paraphrases
 
     is_paraphrase_text = paraphrased_sentence_count > 0 #min_matching_sentences
-
+
+    # Method 2: Check if overlapped words between sentences are more than 50%
+    equal_idx_1, _ = extract_equal_text(input_sentences[0], best_matched_sentence)
+    matched_count = 0
+    for index in equal_idx_1:
+        matched_count += index["end"] - index["start"]
+    sent = input_sentences[0].translate(str.maketrans('', '', string.punctuation))
+    num_words = len(sent.split())
+    if matched_count > num_words / 2:
+        is_paraphrase_text = True
+
     return is_paraphrase_text, alignment
 
+
 def similarity_ratio(a, b):
     """
     Calculates the similarity ratio between two strings using SequenceMatcher.
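The new "Method 2" fallback in check_paraphrase declares a paraphrase when more than half of the input sentence's words (punctuation removed) fall inside 'equal' blocks against the best-matching page sentence, even if the embedding similarity stayed below the threshold. The same check as a stand-alone helper:

import string

from src.application.text.helper import extract_equal_text

def overlap_says_paraphrase(input_sentence: str, best_match: str) -> bool:
    # Count words covered by SequenceMatcher 'equal' blocks.
    equal_idx_1, _ = extract_equal_text(input_sentence, best_match)
    matched_count = sum(idx["end"] - idx["start"] for idx in equal_idx_1)
    # Compare against the word count of the punctuation-stripped input.
    stripped = input_sentence.translate(str.maketrans('', '', string.punctuation))
    return matched_count > len(stripped.split()) / 2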
test.py CHANGED

@@ -1,46 +1,27 @@
-from difflib import SequenceMatcher
-import string
+import json
 
-def extract_equal_text(text1, text2):
-    def cleanup(text):
-        text = text.lower()
-        text = text.translate(str.maketrans('', '', string.punctuation))
-        return text
-
-    splited_text1 = cleanup(text1).split()
-    splited_text2 = cleanup(text2).split()
-
-    s = SequenceMatcher(None, splited_text1, splited_text2)
-
-    equal_idx_1 = []
-    equal_idx_2 = []
-    text1 = text1.split()
-    text2 = text2.split()
-    for tag, i1, i2, j1, j2 in s.get_opcodes():
-        if tag == 'equal':
-            equal_idx_1.append({"start": i1, "end": i2})
-            equal_idx_2.append({"start": j1, "end": j2})
-            subtext_1 = " ".join(text1[i1:i2])
-            subtext_2 = " ".join(text2[j1:j2])
-            print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] {subtext_1!r:>55} --> {subtext_2!r}')
-
-    return equal_idx_1, equal_idx_2
-
-text1 = """
-Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m.
-Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
-He made a substitute appearance and waved farewell to fans in Newcastle's recent win against Southampton.
-Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2022-23, and scored against Paris St-Germain in the Champions League.
+text = """```json
+[
+    ["Sunday", "Thursday"],
+    ["two millions", "one million"],
+    ["north", "east"],
+    ["Japan", "UK"],
+    ["Sunday", "Thursday"]
+]
+```
 """
-
-
-
-
-
-
-
-
+def read_json(json_string) -> list[list[str]]:
+    try:
+        entities = json.loads(json_string)
+        # Remove duplicate pairs of entities
+        unique_data = []
+        for inner_list in entities:
+            if inner_list not in unique_data:
+                unique_data.append(inner_list)
+
+        return unique_data
 
-
-
-
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON: {e}")
+        return []
+print(read_json(text.replace("```json", "").replace("```", "")))
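As a sanity check, running this script prints [['Sunday', 'Thursday'], ['two millions', 'one million'], ['north', 'east'], ['Japan', 'UK']]: the ```json fence markers are stripped before json.loads, and read_json drops the duplicate ["Sunday", "Thursday"] pair while preserving order.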