pmkhanh7890 committed on
Commit 38fd181 · 1 Parent(s): 504f37b

run pre-commit

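The diffs below are mechanical formatting fixes of the kind pre-commit hooks apply: quote normalization, trailing commas, line-length wrapping, and splitting long imports. The repository's hook configuration is not part of this commit, so the snippet below is only an illustrative way to invoke the hooks from Python; the usual command-line equivalent is `pre-commit run --all-files`.

```python
# Illustrative only: run all configured pre-commit hooks against the repo.
# Assumes the pre-commit package is installed and a .pre-commit-config.yaml
# exists at the repository root (that file is not shown in this commit).
import subprocess

result = subprocess.run(
    ["pre-commit", "run", "--all-files"],
    capture_output=True,
    text=True,
)
print(result.stdout)
```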
.sample-env CHANGED
@@ -1,4 +1,4 @@
1
  [API_KEY]
2
  OPENAI_API_KEY=your_api_key # Replace with your actual OpenAI API key
3
  GEMINI_API_KEY=your_api_key
4
- TOGETHER_API_KEY=your_api_key
 
1
  [API_KEY]
2
  OPENAI_API_KEY=your_api_key # Replace with your actual OpenAI API key
3
  GEMINI_API_KEY=your_api_key
4
+ TOGETHER_API_KEY=your_api_key
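For context, these keys are typically loaded at runtime with python-dotenv, which the repository already imports in gpt_test.py. A minimal sketch, assuming .sample-env has been copied to a local .env; dotenv-style loaders generally skip lines that are not KEY=VALUE pairs, such as the [API_KEY] header:

```python
# Minimal sketch: read the keys defined above from a local .env file.
# Assumes python-dotenv is installed (gpt_test.py already uses it).
import os

from dotenv import load_dotenv

load_dotenv()  # non KEY=VALUE lines such as "[API_KEY]" are skipped

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")
```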
README.md CHANGED
@@ -1,11 +1,11 @@
1
  ---
2
- title: "FAKE NEWS DETECTION"
3
- emoji: "🚀"
4
- colorFrom: "green"
5
- colorTo: "blue"
6
- sdk: "gradio"
7
- sdk_version: "5.13.1"
8
- app_file: "application.py"
9
  pinned: false
10
  ---
11
 
 
1
  ---
2
+ title: "FAKE NEWS DETECTION"
3
+ emoji: "🚀"
4
+ colorFrom: "green"
5
+ colorTo: "blue"
6
+ sdk: "gradio"
7
+ sdk_version: "5.13.1"
8
+ app_file: "application.py"
9
  pinned: false
10
  ---
11
 
application.py CHANGED
@@ -1,44 +1,53 @@
1
- import os
2
-
3
  import gradio as gr
4
  import requests
5
  from PIL import Image
6
 
7
  from src.application.content_detection import NewsVerification
 
 
 
 
 
8
  from src.application.url_reader import URLReader
9
- from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
 
11
  AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
12
  AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
13
 
 
14
  def load_url(url):
15
  """
16
  Load content from the given URL.
17
  """
18
  content = URLReader(url)
19
  image = None
20
- header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'}
 
 
21
  try:
22
  response = requests.get(
23
- url,
24
- headers = header,
25
- stream = True
26
  )
27
  response.raise_for_status() # Raise an exception for bad status codes
28
-
29
  image_response = requests.get(content.top_image, stream=True)
30
  try:
31
  image = Image.open(image_response.raw)
32
- except:
33
- print(f"Error loading image from {content.top_image}")
34
-
35
  except (requests.exceptions.RequestException, FileNotFoundError) as e:
36
  print(f"Error fetching image: {e}")
37
 
38
  return content.title, content.text, image
39
 
40
 
41
- def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
 
 
 
 
42
  news_analysis = NewsVerification()
43
  news_analysis.load_news(news_title, news_content, news_image)
44
  news_analysis.generate_analysis_report()
@@ -48,80 +57,100 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Imag
48
  # Define the GUI
49
  with gr.Blocks() as demo:
50
  gr.Markdown("# NEWS VERIFICATION")
51
-
52
  with gr.Row():
53
- # SETTINGS
54
  with gr.Column(scale=1):
55
- with gr.Accordion("1. Enter a URL"):
56
- url_input = gr.Textbox(
57
- label="",
58
- show_label=False,
59
- value="",
60
- )
61
- load_button = gr.Button("Load URL")
62
-
63
- with gr.Accordion("2. Select content-generation models", open=True, visible=False):
64
- with gr.Row():
65
- text_generation_model = gr.Dropdown(choices=AZURE_TEXT_MODEL, label="Text-generation model")
66
- image_generation_model = gr.Dropdown(choices=AZURE_IMAGE_MODEL, label="Image-generation model")
67
- generate_text_button = gr.Button("Generate text")
68
- generate_image_button = gr.Button("Generate image")
69
-
70
- with gr.Accordion("3. Replace any terms", open=True, visible=False):
71
- replace_df = gr.Dataframe(
72
- headers=["Find what:", "Replace with:"],
73
- datatype=["str", "str"],
74
- row_count=(1, "dynamic"),
75
- col_count=(2, "fixed"),
76
- interactive=True
77
  )
78
- replace_button = gr.Button("Replace all")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # GENERATED CONTENT
81
- with gr.Accordion("Input News"):
82
- news_title = gr.Textbox(label="Title", value="")
83
- news_image = gr.Image(label="Image", type="filepath")
84
- news_content = gr.Textbox(label="Content", value="", lines=13)
85
 
86
  # NEWS ANALYSIS REPORT
87
  ordinary_user_explanation = """
88
- FOR ORDINARY USER<br>
89
- - Green texts are the matched words in the input and source news.<br>
90
- - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
 
91
  """
92
  fact_checker_explanation = """
93
- FOR FACT CHECKER<br>
94
- - Green texts are the matched words in the input and source news.<br>
95
- - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
 
96
  """
97
  governor_explanation = """
98
- FOR GOVERNOR<br>
99
- - Green texts are the matched words in the input and source news.<br>
100
- - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
 
101
  """
102
  table = """
103
- <h5>Comparison between input news and source news:</h5>
104
- <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
105
- <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
106
- <thead>
107
- <tr>
108
- <th>Input news</th>
109
- <th>Source (URL provided in Originality column correspondingly)</th>
110
- <th>Forensic</th>
111
- <th>Originality</th>
112
- </tr>
113
- </thead>
114
- <tbody>
115
- <tr>
116
- <th>TBD</th>
117
- <th>TBD</th>
118
- <th>TBD</th>
119
- <th>TBD</th>
120
- </tr>
121
- </tbody>
122
- </table>
123
-
124
- <style>"""
 
 
 
125
  with gr.Column(scale=2):
126
  with gr.Accordion("NEWS ANALYSIS"):
127
  verification_button = gr.Button("Verify news")
@@ -137,56 +166,79 @@ with gr.Blocks() as demo:
137
 
138
  # Connect events
139
  load_button.click(
140
- load_url,
141
- inputs=url_input,
142
- outputs=[news_title, news_content, news_image]
143
- )
144
- replace_button.click(replace_text,
145
- inputs=[news_title, news_content, replace_df],
146
- outputs=[news_title, news_content])
147
- generate_text_button.click(generate_fake_text,
148
- inputs=[text_generation_model, news_title, news_content],
149
- outputs=[news_title, news_content])
150
- generate_image_button.click(generate_fake_image,
151
- inputs=[image_generation_model, news_title],
152
- outputs=[news_image])
153
- verification_button.click(generate_analysis_report,
154
- inputs=[news_title, news_content, news_image],
155
- outputs=[ordinary_user_result, fact_checker_result, governor_result])
 
 
 
 
 
 
 
 
156
 
157
  # change Image
158
- #url_input.change(load_image, inputs=url_input, outputs=image_view)
159
-
160
  try:
161
- with open('examples/example_text_real.txt','r', encoding='utf-8') as file:
 
 
 
162
  text_real_1 = file.read()
163
- with open('examples/example_text_real_2.txt','r', encoding='utf-8') as file:
 
 
 
164
  text_real_2 = file.read()
165
- with open('examples/example_text_LLM_topic.txt','r', encoding='utf-8') as file:
 
 
 
166
  text_llm_topic = file.read()
167
- with open('examples/example_text_LLM_modification.txt','r', encoding='utf-8') as file:
 
 
 
168
  text_llm_modification = file.read()
169
- with open('examples/example_text_LLM_entities.txt','r', encoding='utf-8') as file:
 
 
 
170
  text_llm_entities = file.read()
171
  except FileNotFoundError:
172
  print("File not found.")
173
  except Exception as e:
174
  print(f"An error occurred: {e}")
175
-
176
  title_1 = "Southampton news: Leeds target striker Cameron Archer."
177
  title_2 = "Southampton news: Leeds target striker Cameron Archer."
178
  title_4 = "Japan pledges support for Ukraine with 100-year pact."
179
-
180
  image_1 = "examples/example_image_real_1.jpg.webp"
181
  image_2 = "examples/example_image_real_2.jpg.webp"
182
  image_3 = "examples/example_image_real_3.jpg"
183
  image_4 = "examples/example_image_real_4.jpg.webp"
184
-
185
  gr.Examples(
186
  examples=[
187
- [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
188
- [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
189
- [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
190
  [title_4, image_4, text_llm_entities],
191
  ],
192
  inputs=[news_title, news_image, news_content],
@@ -199,4 +251,4 @@ with gr.Blocks() as demo:
199
  ],
200
  )
201
 
202
- demo.launch(share=True)
 
 
 
1
  import gradio as gr
2
  import requests
3
  from PIL import Image
4
 
5
  from src.application.content_detection import NewsVerification
6
+ from src.application.content_generation import (
7
+ generate_fake_image,
8
+ generate_fake_text,
9
+ replace_text,
10
+ )
11
  from src.application.url_reader import URLReader
 
12
 
13
  AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
14
  AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
15
 
16
+
17
  def load_url(url):
18
  """
19
  Load content from the given URL.
20
  """
21
  content = URLReader(url)
22
  image = None
23
+ header = {
24
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
25
+ }
26
  try:
27
  response = requests.get(
28
+ url,
29
+ headers=header,
30
+ stream=True,
31
  )
32
  response.raise_for_status() # Raise an exception for bad status codes
33
+
34
  image_response = requests.get(content.top_image, stream=True)
35
  try:
36
  image = Image.open(image_response.raw)
37
+ except OSError as e:
38
+ print(f"Error loading image from {content.top_image}: {e}")
39
+
40
  except (requests.exceptions.RequestException, FileNotFoundError) as e:
41
  print(f"Error fetching image: {e}")
42
 
43
  return content.title, content.text, image
44
 
45
 
46
+ def generate_analysis_report(
47
+ news_title: str,
48
+ news_content: str,
49
+ news_image: Image,
50
+ ):
51
  news_analysis = NewsVerification()
52
  news_analysis.load_news(news_title, news_content, news_image)
53
  news_analysis.generate_analysis_report()
 
57
  # Define the GUI
58
  with gr.Blocks() as demo:
59
  gr.Markdown("# NEWS VERIFICATION")
60
+
61
  with gr.Row():
62
+ # SETTINGS
63
  with gr.Column(scale=1):
64
+ with gr.Accordion("1. Enter a URL"):
65
+ url_input = gr.Textbox(
66
+ label="",
67
+ show_label=False,
68
+ value="",
69
+ )
70
+ load_button = gr.Button("Load URL")
71
+
72
+ with gr.Accordion(
73
+ "2. Select content-generation models",
74
+ open=True,
75
+ visible=False,
76
+ ):
77
+ with gr.Row():
78
+ text_generation_model = gr.Dropdown(
79
+ choices=AZURE_TEXT_MODEL,
80
+ label="Text-generation model",
81
+ )
82
+ image_generation_model = gr.Dropdown(
83
+ choices=AZURE_IMAGE_MODEL,
84
+ label="Image-generation model",
 
85
  )
86
+ generate_text_button = gr.Button("Generate text")
87
+ generate_image_button = gr.Button("Generate image")
88
+
89
+ with gr.Accordion(
90
+ "3. Replace any terms",
91
+ open=True,
92
+ visible=False,
93
+ ):
94
+ replace_df = gr.Dataframe(
95
+ headers=["Find what:", "Replace with:"],
96
+ datatype=["str", "str"],
97
+ row_count=(1, "dynamic"),
98
+ col_count=(2, "fixed"),
99
+ interactive=True,
100
+ )
101
+ replace_button = gr.Button("Replace all")
102
 
103
+ # GENERATED CONTENT
104
+ with gr.Accordion("Input News"):
105
+ news_title = gr.Textbox(label="Title", value="")
106
+ news_image = gr.Image(label="Image", type="filepath")
107
+ news_content = gr.Textbox(label="Content", value="", lines=13)
108
 
109
  # NEWS ANALYSIS REPORT
110
  ordinary_user_explanation = """
111
+ FOR ORDINARY USER<br>
112
+ - Green texts are the matched words in the input and source news.<br>
113
+ - Each highlighted pair (marked with a number) shows the key differences
114
+ between the input text and the source.
115
  """
116
  fact_checker_explanation = """
117
+ FOR FACT CHECKER<br>
118
+ - Green texts are the matched words in the input and source news.<br>
119
+ - Each highlighted pair (marked with a number) shows the key differences
120
+ between the input text and the source.
121
  """
122
  governor_explanation = """
123
+ FOR GOVERNOR<br>
124
+ - Green texts are the matched words in the input and source news.<br>
125
+ - Each highlighted pair (marked with a number) shows the key differences
126
+ between the input text and the source.
127
  """
128
  table = """
129
+ <h5>Comparison between input news and source news:</h5>
130
+ <table border="1" style="width:100%; text-align:left;">
131
+ <col style="width: 170px;">
132
+ <col style="width: 170px;">
133
+ <col style="width: 30px;">
134
+ <col style="width: 75px;">
135
+ <thead>
136
+ <tr>
137
+ <th>Input news</th>
138
+ <th>Source (corresponding URL provided in Originality)</th>
139
+ <th>Forensic</th>
140
+ <th>Originality</th>
141
+ </tr>
142
+ </thead>
143
+ <tbody>
144
+ <tr>
145
+ <th>TBD</th>
146
+ <th>TBD</th>
147
+ <th>TBD</th>
148
+ <th>TBD</th>
149
+ </tr>
150
+ </tbody>
151
+ </table>
152
+
153
+ <style>"""
154
  with gr.Column(scale=2):
155
  with gr.Accordion("NEWS ANALYSIS"):
156
  verification_button = gr.Button("Verify news")
 
166
 
167
  # Connect events
168
  load_button.click(
169
+ load_url,
170
+ inputs=url_input,
171
+ outputs=[news_title, news_content, news_image],
172
+ )
173
+ replace_button.click(
174
+ replace_text,
175
+ inputs=[news_title, news_content, replace_df],
176
+ outputs=[news_title, news_content],
177
+ )
178
+ generate_text_button.click(
179
+ generate_fake_text,
180
+ inputs=[text_generation_model, news_title, news_content],
181
+ outputs=[news_title, news_content],
182
+ )
183
+ generate_image_button.click(
184
+ generate_fake_image,
185
+ inputs=[image_generation_model, news_title],
186
+ outputs=[news_image],
187
+ )
188
+ verification_button.click(
189
+ generate_analysis_report,
190
+ inputs=[news_title, news_content, news_image],
191
+ outputs=[ordinary_user_result, fact_checker_result, governor_result],
192
+ )
193
 
194
  # change Image
195
+ # url_input.change(load_image, inputs=url_input, outputs=image_view)
196
+
197
  try:
198
+ with open(
199
+ "examples/example_text_real.txt",
200
+ encoding="utf-8",
201
+ ) as file:
202
  text_real_1 = file.read()
203
+ with open(
204
+ "examples/example_text_real_2.txt",
205
+ encoding="utf-8",
206
+ ) as file:
207
  text_real_2 = file.read()
208
+ with open(
209
+ "examples/example_text_LLM_topic.txt",
210
+ encoding="utf-8",
211
+ ) as file:
212
  text_llm_topic = file.read()
213
+ with open(
214
+ "examples/example_text_LLM_modification.txt",
215
+ encoding="utf-8",
216
+ ) as file:
217
  text_llm_modification = file.read()
218
+ with open(
219
+ "examples/example_text_LLM_entities.txt",
220
+ encoding="utf-8",
221
+ ) as file:
222
  text_llm_entities = file.read()
223
  except FileNotFoundError:
224
  print("File not found.")
225
  except Exception as e:
226
  print(f"An error occurred: {e}")
227
+
228
  title_1 = "Southampton news: Leeds target striker Cameron Archer."
229
  title_2 = "Southampton news: Leeds target striker Cameron Archer."
230
  title_4 = "Japan pledges support for Ukraine with 100-year pact."
231
+
232
  image_1 = "examples/example_image_real_1.jpg.webp"
233
  image_2 = "examples/example_image_real_2.jpg.webp"
234
  image_3 = "examples/example_image_real_3.jpg"
235
  image_4 = "examples/example_image_real_4.jpg.webp"
236
+
237
  gr.Examples(
238
  examples=[
239
+ [title_1, image_1, text_real_1 + "\n\n" + text_real_2],
240
+ [title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
241
+ [title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
242
  [title_4, image_4, text_llm_entities],
243
  ],
244
  inputs=[news_title, news_image, news_content],
 
251
  ],
252
  )
253
 
254
+ demo.launch(share=True)
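The reformatted event wiring above follows Gradio's `Button.click(fn, inputs, outputs)` pattern: the listed component values are passed to the function, and its return values are written back to the output components. A self-contained sketch of the same pattern with a toy function (not from this repository):

```python
import gradio as gr


def echo_title(title: str) -> str:
    # Toy stand-in for load_url / generate_analysis_report.
    return title.upper()


with gr.Blocks() as toy_demo:
    title_box = gr.Textbox(label="Title")
    result_box = gr.Textbox(label="Result")
    run_button = gr.Button("Run")
    # Same wiring pattern as application.py: fn, inputs, outputs.
    run_button.click(echo_title, inputs=title_box, outputs=result_box)

if __name__ == "__main__":
    toy_demo.launch()
```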
application_2.py CHANGED
@@ -1,44 +1,53 @@
1
- import os
2
-
3
  import gradio as gr
4
  import requests
5
  from PIL import Image
6
 
7
  from src.application.content_detection import NewsVerification
 
 
 
 
 
8
  from src.application.url_reader import URLReader
9
- from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
 
11
  AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
12
  AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
13
 
 
14
  def load_url(url):
15
  """
16
  Load content from the given URL.
17
  """
18
  content = URLReader(url)
19
  image = None
20
- header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'}
 
 
21
  try:
22
  response = requests.get(
23
- url,
24
- headers = header,
25
- stream = True
26
  )
27
  response.raise_for_status() # Raise an exception for bad status codes
28
-
29
  image_response = requests.get(content.top_image, stream=True)
30
  try:
31
  image = Image.open(image_response.raw)
32
- except:
33
- print(f"Error loading image from {content.top_image}")
34
-
35
  except (requests.exceptions.RequestException, FileNotFoundError) as e:
36
  print(f"Error fetching image: {e}")
37
 
38
  return content.title, content.text, image
39
 
40
 
41
- def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
 
 
 
 
42
  news_analysis = NewsVerification()
43
  news_analysis.load_news(news_title, news_content, news_image)
44
  news_analysis.generate_analysis_report()
@@ -48,80 +57,100 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Imag
48
  # Define the GUI
49
  with gr.Blocks() as demo:
50
  gr.Markdown("# NEWS VERIFICATION")
51
-
52
  with gr.Row():
53
- # SETTINGS
54
  with gr.Column(scale=1):
55
- with gr.Accordion("1. Enter a URL"):
56
- url_input = gr.Textbox(
57
- label="",
58
- show_label=False,
59
- value="",
60
- )
61
- load_button = gr.Button("Load URL")
62
-
63
- with gr.Accordion("2. Select content-generation models", open=True, visible=False):
64
- with gr.Row():
65
- text_generation_model = gr.Dropdown(choices=AZURE_TEXT_MODEL, label="Text-generation model")
66
- image_generation_model = gr.Dropdown(choices=AZURE_IMAGE_MODEL, label="Image-generation model")
67
- generate_text_button = gr.Button("Generate text")
68
- generate_image_button = gr.Button("Generate image")
69
-
70
- with gr.Accordion("3. Replace any terms", open=True, visible=False):
71
- replace_df = gr.Dataframe(
72
- headers=["Find what:", "Replace with:"],
73
- datatype=["str", "str"],
74
- row_count=(1, "dynamic"),
75
- col_count=(2, "fixed"),
76
- interactive=True
77
  )
78
- replace_button = gr.Button("Replace all")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # GENERATED CONTENT
81
- with gr.Accordion("Input News"):
82
- news_title = gr.Textbox(label="Title", value="")
83
- news_image = gr.Image(label="Image", type="filepath")
84
- news_content = gr.Textbox(label="Content", value="", lines=13)
85
 
86
  # NEWS ANALYSIS REPORT
87
  ordinary_user_explanation = """
88
- FOR ORDINARY USER<br>
89
- - Green texts are the matched words in the input and source news.<br>
90
- - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
 
91
  """
92
  fact_checker_explanation = """
93
- FOR FACT CHECKER<br>
94
- - Green texts are the matched words in the input and source news.<br>
95
- - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
 
96
  """
97
  governor_explanation = """
98
- FOR GOVERNOR<br>
99
- - Green texts are the matched words in the input and source news.<br>
100
- - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
 
101
  """
102
  table = """
103
- <h5>Comparison between input news and source news:</h5>
104
- <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
105
- <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
106
- <thead>
107
- <tr>
108
- <th>Input news</th>
109
- <th>Source (URL provided in Originality column correspondingly)</th>
110
- <th>Forensic</th>
111
- <th>Originality</th>
112
- </tr>
113
- </thead>
114
- <tbody>
115
- <tr>
116
- <th>TBD</th>
117
- <th>TBD</th>
118
- <th>TBD</th>
119
- <th>TBD</th>
120
- </tr>
121
- </tbody>
122
- </table>
123
-
124
- <style>"""
 
 
 
125
  with gr.Column(scale=2):
126
  with gr.Accordion("NEWS ANALYSIS"):
127
  verification_button = gr.Button("Verify news")
@@ -137,56 +166,79 @@ with gr.Blocks() as demo:
137
 
138
  # Connect events
139
  load_button.click(
140
- load_url,
141
- inputs=url_input,
142
- outputs=[news_title, news_content, news_image]
143
- )
144
- replace_button.click(replace_text,
145
- inputs=[news_title, news_content, replace_df],
146
- outputs=[news_title, news_content])
147
- generate_text_button.click(generate_fake_text,
148
- inputs=[text_generation_model, news_title, news_content],
149
- outputs=[news_title, news_content])
150
- generate_image_button.click(generate_fake_image,
151
- inputs=[image_generation_model, news_title],
152
- outputs=[news_image])
153
- verification_button.click(generate_analysis_report,
154
- inputs=[news_title, news_content, news_image],
155
- outputs=[ordinary_user_result, fact_checker_result, governor_result])
 
 
 
 
 
 
 
 
156
 
157
  # change Image
158
- #url_input.change(load_image, inputs=url_input, outputs=image_view)
159
-
160
  try:
161
- with open('examples/example_text_real.txt','r', encoding='utf-8') as file:
 
 
 
162
  text_real_1 = file.read()
163
- with open('examples/example_text_real_2.txt','r', encoding='utf-8') as file:
 
 
 
164
  text_real_2 = file.read()
165
- with open('examples/example_text_LLM_topic.txt','r', encoding='utf-8') as file:
 
 
 
166
  text_llm_topic = file.read()
167
- with open('examples/example_text_LLM_modification.txt','r', encoding='utf-8') as file:
 
 
 
168
  text_llm_modification = file.read()
169
- with open('examples/example_text_LLM_entities.txt','r', encoding='utf-8') as file:
 
 
 
170
  text_llm_entities = file.read()
171
  except FileNotFoundError:
172
  print("File not found.")
173
  except Exception as e:
174
  print(f"An error occurred: {e}")
175
-
176
  title_1 = "Southampton news: Leeds target striker Cameron Archer."
177
  title_2 = "Southampton news: Leeds target striker Cameron Archer."
178
  title_4 = "Japan pledges support for Ukraine with 100-year pact."
179
-
180
  image_1 = "examples/example_image_real_1.jpg.webp"
181
  image_2 = "examples/example_image_real_2.jpg.webp"
182
  image_3 = "examples/example_image_real_3.jpg"
183
  image_4 = "examples/example_image_real_4.jpg.webp"
184
-
185
  gr.Examples(
186
  examples=[
187
- [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
188
- [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
189
- [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
190
  [title_4, image_4, text_llm_entities],
191
  ],
192
  inputs=[news_title, news_image, news_content],
@@ -199,4 +251,4 @@ with gr.Blocks() as demo:
199
  ],
200
  )
201
 
202
- demo.launch(share=True)
 
 
 
1
  import gradio as gr
2
  import requests
3
  from PIL import Image
4
 
5
  from src.application.content_detection import NewsVerification
6
+ from src.application.content_generation import (
7
+ generate_fake_image,
8
+ generate_fake_text,
9
+ replace_text,
10
+ )
11
  from src.application.url_reader import URLReader
 
12
 
13
  AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
14
  AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
15
 
16
+
17
  def load_url(url):
18
  """
19
  Load content from the given URL.
20
  """
21
  content = URLReader(url)
22
  image = None
23
+ header = {
24
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
25
+ }
26
  try:
27
  response = requests.get(
28
+ url,
29
+ headers=header,
30
+ stream=True,
31
  )
32
  response.raise_for_status() # Raise an exception for bad status codes
33
+
34
  image_response = requests.get(content.top_image, stream=True)
35
  try:
36
  image = Image.open(image_response.raw)
37
+ except OSError as e:
38
+ print(f"Error loading image from {content.top_image}: {e}")
39
+
40
  except (requests.exceptions.RequestException, FileNotFoundError) as e:
41
  print(f"Error fetching image: {e}")
42
 
43
  return content.title, content.text, image
44
 
45
 
46
+ def generate_analysis_report(
47
+ news_title: str,
48
+ news_content: str,
49
+ news_image: Image,
50
+ ):
51
  news_analysis = NewsVerification()
52
  news_analysis.load_news(news_title, news_content, news_image)
53
  news_analysis.generate_analysis_report()
 
57
  # Define the GUI
58
  with gr.Blocks() as demo:
59
  gr.Markdown("# NEWS VERIFICATION")
60
+
61
  with gr.Row():
62
+ # SETTINGS
63
  with gr.Column(scale=1):
64
+ with gr.Accordion("1. Enter a URL"):
65
+ url_input = gr.Textbox(
66
+ label="",
67
+ show_label=False,
68
+ value="",
69
+ )
70
+ load_button = gr.Button("Load URL")
71
+
72
+ with gr.Accordion(
73
+ "2. Select content-generation models",
74
+ open=True,
75
+ visible=False,
76
+ ):
77
+ with gr.Row():
78
+ text_generation_model = gr.Dropdown(
79
+ choices=AZURE_TEXT_MODEL,
80
+ label="Text-generation model",
81
+ )
82
+ image_generation_model = gr.Dropdown(
83
+ choices=AZURE_IMAGE_MODEL,
84
+ label="Image-generation model",
 
85
  )
86
+ generate_text_button = gr.Button("Generate text")
87
+ generate_image_button = gr.Button("Generate image")
88
+
89
+ with gr.Accordion(
90
+ "3. Replace any terms",
91
+ open=True,
92
+ visible=False,
93
+ ):
94
+ replace_df = gr.Dataframe(
95
+ headers=["Find what:", "Replace with:"],
96
+ datatype=["str", "str"],
97
+ row_count=(1, "dynamic"),
98
+ col_count=(2, "fixed"),
99
+ interactive=True,
100
+ )
101
+ replace_button = gr.Button("Replace all")
102
 
103
+ # GENERATED CONTENT
104
+ with gr.Accordion("Input News"):
105
+ news_title = gr.Textbox(label="Title", value="")
106
+ news_image = gr.Image(label="Image", type="filepath")
107
+ news_content = gr.Textbox(label="Content", value="", lines=13)
108
 
109
  # NEWS ANALYSIS REPORT
110
  ordinary_user_explanation = """
111
+ FOR ORDINARY USER<br>
112
+ - Green texts are the matched words in the input and source news.<br>
113
+ - Each highlighted pair (marked with a number) shows the key differences
114
+ between the input text and the source.
115
  """
116
  fact_checker_explanation = """
117
+ FOR FACT CHECKER<br>
118
+ - Green texts are the matched words in the input and source news.<br>
119
+ - Each highlighted pair (marked with a number) shows the key differences
120
+ between the input text and the source.
121
  """
122
  governor_explanation = """
123
+ FOR GOVERNOR<br>
124
+ - Green texts are the matched words in the input and source news.<br>
125
+ - Each highlighted pair (marked with a number) shows the key differences
126
+ between the input text and the source.
127
  """
128
  table = """
129
+ <h5>Comparison between input news and source news:</h5>
130
+ <table border="1" style="width:100%; text-align:left;">
131
+ <col style="width: 170px;">
132
+ <col style="width: 170px;">
133
+ <col style="width: 30px;">
134
+ <col style="width: 75px;">
135
+ <thead>
136
+ <tr>
137
+ <th>Input news</th>
138
+ <th>Source (corresponding URL provided in Originality)</th>
139
+ <th>Forensic</th>
140
+ <th>Originality</th>
141
+ </tr>
142
+ </thead>
143
+ <tbody>
144
+ <tr>
145
+ <th>TBD</th>
146
+ <th>TBD</th>
147
+ <th>TBD</th>
148
+ <th>TBD</th>
149
+ </tr>
150
+ </tbody>
151
+ </table>
152
+
153
+ <style>"""
154
  with gr.Column(scale=2):
155
  with gr.Accordion("NEWS ANALYSIS"):
156
  verification_button = gr.Button("Verify news")
 
166
 
167
  # Connect events
168
  load_button.click(
169
+ load_url,
170
+ inputs=url_input,
171
+ outputs=[news_title, news_content, news_image],
172
+ )
173
+ replace_button.click(
174
+ replace_text,
175
+ inputs=[news_title, news_content, replace_df],
176
+ outputs=[news_title, news_content],
177
+ )
178
+ generate_text_button.click(
179
+ generate_fake_text,
180
+ inputs=[text_generation_model, news_title, news_content],
181
+ outputs=[news_title, news_content],
182
+ )
183
+ generate_image_button.click(
184
+ generate_fake_image,
185
+ inputs=[image_generation_model, news_title],
186
+ outputs=[news_image],
187
+ )
188
+ verification_button.click(
189
+ generate_analysis_report,
190
+ inputs=[news_title, news_content, news_image],
191
+ outputs=[ordinary_user_result, fact_checker_result, governor_result],
192
+ )
193
 
194
  # change Image
195
+ # url_input.change(load_image, inputs=url_input, outputs=image_view)
196
+
197
  try:
198
+ with open(
199
+ "examples/example_text_real.txt",
200
+ encoding="utf-8",
201
+ ) as file:
202
  text_real_1 = file.read()
203
+ with open(
204
+ "examples/example_text_real_2.txt",
205
+ encoding="utf-8",
206
+ ) as file:
207
  text_real_2 = file.read()
208
+ with open(
209
+ "examples/example_text_LLM_topic.txt",
210
+ encoding="utf-8",
211
+ ) as file:
212
  text_llm_topic = file.read()
213
+ with open(
214
+ "examples/example_text_LLM_modification.txt",
215
+ encoding="utf-8",
216
+ ) as file:
217
  text_llm_modification = file.read()
218
+ with open(
219
+ "examples/example_text_LLM_entities.txt",
220
+ encoding="utf-8",
221
+ ) as file:
222
  text_llm_entities = file.read()
223
  except FileNotFoundError:
224
  print("File not found.")
225
  except Exception as e:
226
  print(f"An error occurred: {e}")
227
+
228
  title_1 = "Southampton news: Leeds target striker Cameron Archer."
229
  title_2 = "Southampton news: Leeds target striker Cameron Archer."
230
  title_4 = "Japan pledges support for Ukraine with 100-year pact."
231
+
232
  image_1 = "examples/example_image_real_1.jpg.webp"
233
  image_2 = "examples/example_image_real_2.jpg.webp"
234
  image_3 = "examples/example_image_real_3.jpg"
235
  image_4 = "examples/example_image_real_4.jpg.webp"
236
+
237
  gr.Examples(
238
  examples=[
239
+ [title_1, image_1, text_real_1 + "\n\n" + text_real_2],
240
+ [title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
241
+ [title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
242
  [title_4, image_4, text_llm_entities],
243
  ],
244
  inputs=[news_title, news_image, news_content],
 
251
  ],
252
  )
253
 
254
+ demo.launch(share=True)
application_3.py ADDED
@@ -0,0 +1,254 @@
1
+ import gradio as gr
2
+ import requests
3
+ from PIL import Image
4
+
5
+ from src.application.content_detection import NewsVerification
6
+ from src.application.content_generation import (
7
+ generate_fake_image,
8
+ generate_fake_text,
9
+ replace_text,
10
+ )
11
+ from src.application.url_reader import URLReader
12
+
13
+ AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
14
+ AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
15
+
16
+
17
+ def load_url(url):
18
+ """
19
+ Load content from the given URL.
20
+ """
21
+ content = URLReader(url)
22
+ image = None
23
+ header = {
24
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
25
+ }
26
+ try:
27
+ response = requests.get(
28
+ url,
29
+ headers=header,
30
+ stream=True,
31
+ )
32
+ response.raise_for_status() # Raise an exception for bad status codes
33
+
34
+ image_response = requests.get(content.top_image, stream=True)
35
+ try:
36
+ image = Image.open(image_response.raw)
37
+ except OSError as e:
38
+ print(f"Error loading image from {content.top_image}: {e}")
39
+
40
+ except (requests.exceptions.RequestException, FileNotFoundError) as e:
41
+ print(f"Error fetching image: {e}")
42
+
43
+ return content.title, content.text, image
44
+
45
+
46
+ def generate_analysis_report(
47
+ news_title: str,
48
+ news_content: str,
49
+ news_image: Image,
50
+ ):
51
+ news_analysis = NewsVerification()
52
+ news_analysis.load_news(news_title, news_content, news_image)
53
+ news_analysis.generate_analysis_report()
54
+ return news_analysis.analyze_details()
55
+
56
+
57
+ # Define the GUI
58
+ with gr.Blocks() as demo:
59
+ gr.Markdown("# NEWS VERIFICATION")
60
+
61
+ with gr.Row():
62
+ # SETTINGS
63
+ with gr.Column(scale=1):
64
+ with gr.Accordion("1. Enter a URL"):
65
+ url_input = gr.Textbox(
66
+ label="",
67
+ show_label=False,
68
+ value="",
69
+ )
70
+ load_button = gr.Button("Load URL")
71
+
72
+ with gr.Accordion(
73
+ "2. Select content-generation models",
74
+ open=True,
75
+ visible=False,
76
+ ):
77
+ with gr.Row():
78
+ text_generation_model = gr.Dropdown(
79
+ choices=AZURE_TEXT_MODEL,
80
+ label="Text-generation model",
81
+ )
82
+ image_generation_model = gr.Dropdown(
83
+ choices=AZURE_IMAGE_MODEL,
84
+ label="Image-generation model",
85
+ )
86
+ generate_text_button = gr.Button("Generate text")
87
+ generate_image_button = gr.Button("Generate image")
88
+
89
+ with gr.Accordion(
90
+ "3. Replace any terms",
91
+ open=True,
92
+ visible=False,
93
+ ):
94
+ replace_df = gr.Dataframe(
95
+ headers=["Find what:", "Replace with:"],
96
+ datatype=["str", "str"],
97
+ row_count=(1, "dynamic"),
98
+ col_count=(2, "fixed"),
99
+ interactive=True,
100
+ )
101
+ replace_button = gr.Button("Replace all")
102
+
103
+ # GENERATED CONTENT
104
+ with gr.Accordion("Input News"):
105
+ news_title = gr.Textbox(label="Title", value="")
106
+ news_image = gr.Image(label="Image", type="filepath")
107
+ news_content = gr.Textbox(label="Content", value="", lines=13)
108
+
109
+ # NEWS ANALYSIS REPORT
110
+ ordinary_user_explanation = """
111
+ FOR ORDINARY USER<br>
112
+ - Green texts are the matched words in the input and source news.<br>
113
+ - Each highlighted pair (marked with a number) shows the key differences
114
+ between the input text and the source.
115
+ """
116
+ fact_checker_explanation = """
117
+ FOR FACT CHECKER<br>
118
+ - Green texts are the matched words in the input and source news.<br>
119
+ - Each highlighted pair (marked with a number) shows the key differences
120
+ between the input text and the source.
121
+ """
122
+ governor_explanation = """
123
+ FOR GOVERNOR<br>
124
+ - Green texts are the matched words in the input and source news.<br>
125
+ - Each highlighted pair (marked with a number) shows the key differences
126
+ between the input text and the source.
127
+ """
128
+ table = """
129
+ <h5>Comparison between input news and source news:</h5>
130
+ <table border="1" style="width:100%; text-align:left;">
131
+ <col style="width: 170px;">
132
+ <col style="width: 170px;">
133
+ <col style="width: 30px;">
134
+ <col style="width: 75px;">
135
+ <thead>
136
+ <tr>
137
+ <th>Input news</th>
138
+ <th>Source (corresponding URL provided in Originality)</th>
139
+ <th>Forensic</th>
140
+ <th>Originality</th>
141
+ </tr>
142
+ </thead>
143
+ <tbody>
144
+ <tr>
145
+ <th>TBD</th>
146
+ <th>TBD</th>
147
+ <th>TBD</th>
148
+ <th>TBD</th>
149
+ </tr>
150
+ </tbody>
151
+ </table>
152
+
153
+ <style>"""
154
+ with gr.Column(scale=2):
155
+ with gr.Accordion("NEWS ANALYSIS"):
156
+ verification_button = gr.Button("Verify news")
157
+ with gr.Tab("Orinary User"):
158
+ gr.HTML(ordinary_user_explanation)
159
+ ordinary_user_result = gr.HTML(table)
160
+ with gr.Tab("Fact Checker"):
161
+ gr.HTML(fact_checker_explanation)
162
+ fact_checker_result = gr.HTML(table)
163
+ with gr.Tab("Governor"):
164
+ gr.HTML(governor_explanation)
165
+ governor_result = gr.HTML(table)
166
+
167
+ # Connect events
168
+ load_button.click(
169
+ load_url,
170
+ inputs=url_input,
171
+ outputs=[news_title, news_content, news_image],
172
+ )
173
+ replace_button.click(
174
+ replace_text,
175
+ inputs=[news_title, news_content, replace_df],
176
+ outputs=[news_title, news_content],
177
+ )
178
+ generate_text_button.click(
179
+ generate_fake_text,
180
+ inputs=[text_generation_model, news_title, news_content],
181
+ outputs=[news_title, news_content],
182
+ )
183
+ generate_image_button.click(
184
+ generate_fake_image,
185
+ inputs=[image_generation_model, news_title],
186
+ outputs=[news_image],
187
+ )
188
+ verification_button.click(
189
+ generate_analysis_report,
190
+ inputs=[news_title, news_content, news_image],
191
+ outputs=[ordinary_user_result, fact_checker_result, governor_result],
192
+ )
193
+
194
+ # change Image
195
+ # url_input.change(load_image, inputs=url_input, outputs=image_view)
196
+
197
+ try:
198
+ with open(
199
+ "examples/example_text_real.txt",
200
+ encoding="utf-8",
201
+ ) as file:
202
+ text_real_1 = file.read()
203
+ with open(
204
+ "examples/example_text_real_2.txt",
205
+ encoding="utf-8",
206
+ ) as file:
207
+ text_real_2 = file.read()
208
+ with open(
209
+ "examples/example_text_LLM_topic.txt",
210
+ encoding="utf-8",
211
+ ) as file:
212
+ text_llm_topic = file.read()
213
+ with open(
214
+ "examples/example_text_LLM_modification.txt",
215
+ encoding="utf-8",
216
+ ) as file:
217
+ text_llm_modification = file.read()
218
+ with open(
219
+ "examples/example_text_LLM_entities.txt",
220
+ encoding="utf-8",
221
+ ) as file:
222
+ text_llm_entities = file.read()
223
+ except FileNotFoundError:
224
+ print("File not found.")
225
+ except Exception as e:
226
+ print(f"An error occurred: {e}")
227
+
228
+ title_1 = "Southampton news: Leeds target striker Cameron Archer."
229
+ title_2 = "Southampton news: Leeds target striker Cameron Archer."
230
+ title_4 = "Japan pledges support for Ukraine with 100-year pact."
231
+
232
+ image_1 = "examples/example_image_real_1.jpg.webp"
233
+ image_2 = "examples/example_image_real_2.jpg.webp"
234
+ image_3 = "examples/example_image_real_3.jpg"
235
+ image_4 = "examples/example_image_real_4.jpg.webp"
236
+
237
+ gr.Examples(
238
+ examples=[
239
+ [title_1, image_1, text_real_1 + "\n\n" + text_real_2],
240
+ [title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
241
+ [title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
242
+ [title_4, image_4, text_llm_entities],
243
+ ],
244
+ inputs=[news_title, news_image, news_content],
245
+ label="Examples",
246
+ example_labels=[
247
+ "2 real news",
248
+ "1 real news + 1 LLM modification-based news",
249
+ "1 real news + 1 LLM topic-based news",
250
+ "1 LLM changed-entities news",
251
+ ],
252
+ )
253
+
254
+ demo.launch(share=True)
examples/example_text_LLM_entities.txt CHANGED
@@ -1 +1 @@
1
- Shigeru Ishiba has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
 
1
+ Shigeru Ishiba has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
examples/example_text_LLM_modification.txt CHANGED
@@ -1,3 +1,3 @@
1
- Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
2
- He made a substitute appearance and waved farewell to fans in Newcastle's recent loss against Southampton.
3
- Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2020-21, and scored against Paris St-Germain in the Champions League.
 
1
+ Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
2
+ He made a substitute appearance and waved farewell to fans in Newcastle's recent loss against Southampton.
3
+ Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2020-21, and scored against Paris St-Germain in the Champions League.
examples/example_text_LLM_topic.txt CHANGED
@@ -1,6 +1,6 @@
1
- The January transfer window is in full swing, with clubs across the globe scrambling to strengthen their squads for the remainder of the season.
2
- Premier League giants Manchester City have reportedly made a substantial bid for highly-rated midfielder Enzo Fernandez.
3
- Meanwhile, struggling Serie A side Sampdoria are looking to bolster their attack with the loan signing of veteran striker Fabio Quagliarella.
4
- Rumors are swirling around a potential move for Brazilian wonderkid Endrick to Real Madrid.
5
- The transfer window officially closes on January 31st, leaving clubs with limited time to finalize their deals.
6
- Fans are eagerly awaiting to see which teams make the shrewdest moves in this crucial period.
 
1
+ The January transfer window is in full swing, with clubs across the globe scrambling to strengthen their squads for the remainder of the season.
2
+ Premier League giants Manchester City have reportedly made a substantial bid for highly-rated midfielder Enzo Fernandez.
3
+ Meanwhile, struggling Serie A side Sampdoria are looking to bolster their attack with the loan signing of veteran striker Fabio Quagliarella.
4
+ Rumors are swirling around a potential move for Brazilian wonderkid Endrick to Real Madrid.
5
+ The transfer window officially closes on January 31st, leaving clubs with limited time to finalize their deals.
6
+ Fans are eagerly awaiting to see which teams make the shrewdest moves in this crucial period.
examples/example_text_real.txt CHANGED
@@ -2,4 +2,4 @@ Leeds are targeting a move for Southampton striker Cameron Archer with early tal
2
 
3
  It is unclear whether a deal can be achieved but the 23-year-old is open to a move before deadline day.
4
 
5
- Other options are believed to be on the table as Archer seeks a guaranteed starting role after increasingly finding himself on the bench under recently appointed Saints manager Ivan Juric.
 
2
 
3
  It is unclear whether a deal can be achieved but the 23-year-old is open to a move before deadline day.
4
 
5
+ Other options are believed to be on the table as Archer seeks a guaranteed starting role after increasingly finding himself on the bench under recently appointed Saints manager Ivan Juric.
examples/example_text_real_2.txt CHANGED
@@ -4,4 +4,4 @@ The resignation brings a long political chapter to an end. Trudeau has been in o
4
 
5
  Trudeau said he will remain at the helm until a new Liberal leader is selected.
6
 
7
- But many questions remain for the party, including who will take over and how they will manage a looming federal election. So what happens next?
 
4
 
5
  Trudeau said he will remain at the helm until a new Liberal leader is selected.
6
 
7
+ But many questions remain for the party, including who will take over and how they will manage a looming federal election. So what happens next?
gpt_test.py CHANGED
@@ -1,34 +1,30 @@
1
  import os
 
2
  from dotenv import load_dotenv
3
  from openai import AzureOpenAI
 
4
  load_dotenv()
5
- AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
6
- AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
7
- AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
8
 
9
  azure_client = AzureOpenAI(
10
- azure_endpoint = "https://quoc-nguyen.openai.azure.com/",
11
- api_key=AZURE_OPENAI_API_KEY,
12
- api_version="2024-05-01-preview"
13
  )
14
-
15
- deplopment_name = "o1-mini" # or "gpt-4o"
16
  TEXT_PROMPT = """
17
  replace Ukraine with Denmark:
18
 
19
- "Sir Keir Starmer has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country.
20
-
21
- The prime minister's visit on Thursday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems.
22
-
23
- Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back".
24
-
25
- An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east.
26
-
27
- Zelensky praised the UK's commitment on Thursday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid."
28
  """
29
-
30
  response = azure_client.chat.completions.create(
31
- model=deplopment_name, # model = "deployment_name".
32
  messages=[
33
  # {"role": "system", "content": "You are a helpful assistant."},
34
  {"role": "user", "content": TEXT_PROMPT},
@@ -36,4 +32,4 @@ response = azure_client.chat.completions.create(
36
  # max_tokens=512,
37
  # temperature=0,
38
  )
39
- print(response.choices[0].message.content)
 
1
  import os
2
+
3
  from dotenv import load_dotenv
4
  from openai import AzureOpenAI
5
+
6
  load_dotenv()
7
+ AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
8
+ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
9
+ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
10
 
11
  azure_client = AzureOpenAI(
12
+ azure_endpoint="https://quoc-nguyen.openai.azure.com/",
13
+ api_key=AZURE_OPENAI_API_KEY,
14
+ api_version="2024-05-01-preview",
15
  )
16
+
17
+ deplopment_name = "o1-mini" # or "gpt-4o"
18
  TEXT_PROMPT = """
19
  replace Ukraine with Denmark:
20
 
21
+ "Sir Keir Starmer has pledged to put Ukraine in the "strongest
22
+ possible position" on a trip to Kyiv where he signed a
23
+ "landmark" 100-year pact with the war-stricken country.
 
 
 
 
 
 
24
  """
25
+
26
  response = azure_client.chat.completions.create(
27
+ model=deplopment_name, # model = "deployment_name".
28
  messages=[
29
  # {"role": "system", "content": "You are a helpful assistant."},
30
  {"role": "user", "content": TEXT_PROMPT},
 
32
  # max_tokens=512,
33
  # temperature=0,
34
  )
35
+ print(response.choices[0].message.content)
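One thing the reformatted gpt_test.py still does is read AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_VERSION from the environment while passing hardcoded literals to AzureOpenAI. A sketch that uses the loaded values instead, assuming all three variables are defined in .env:

```python
# Sketch only: build the client from the environment variables the script
# already loads, instead of hardcoded endpoint/version literals.
import os

from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()

azure_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)
```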
requirements.txt CHANGED
@@ -28,4 +28,4 @@ pytorch_lightning
28
  torchvision
29
  torch
30
  lightning
31
- timm
 
28
  torchvision
29
  torch
30
  lightning
31
+ timm
src/application/content_detection.py CHANGED
@@ -1,49 +1,63 @@
1
  from difflib import SequenceMatcher
2
 
3
  import pandas as pd
4
- from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
5
- from src.application.text.entity import apply_highlight, highlight_entities
 
 
 
 
 
 
 
 
6
  from src.application.text.helper import extract_equal_text
7
  from src.application.text.model_detection import detect_text_by_ai_model
8
  from src.application.text.preprocessing import split_into_paragraphs
9
- from src.application.text.search_detection import check_human, detect_text_by_relative_search, find_text_source
 
 
 
 
10
 
11
 
12
- class NewsVerification():
13
  def __init__(self):
14
  self.news_text = ""
15
  self.news_title = ""
16
  self.news_content = ""
17
  self.news_image = ""
18
-
19
- self.text_prediction_label:list[str] = []
20
- self.text_prediction_score:list[float] = []
21
- self.text_referent_url:list[str] = []
22
- self.image_prediction_label:list[str] = []
23
- self.image_prediction_score:list[str] = []
24
- self.image_referent_url:list[str] = []
25
  self.news_prediction_label = ""
26
  self.news_prediction_score = -1
27
-
28
- self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
29
- self.aligned_sentences:list[dict] = []
30
- self.aligned_sentences_df:pd.DataFrame = pd.DataFrame(columns=[
31
- "input_sentence",
32
- "matched_sentence",
33
- "label",
34
- "similarity",
35
- "paraphrase",
36
- "url",
37
- "group",
38
- "entities",
39
- ])
40
- self.is_paraphrased:list[bool] = []
41
-
42
- self.ordinary_user_table:list = []
43
- self.fact_checker_table:list = []
44
- self.governor_table:list = []
 
 
45
  self.entities_with_colors = []
46
-
47
  def load_news(self, news_title, news_content, news_image):
48
  self.news_text = news_title + "\n\n" + news_content
49
  self.news_title = news_title
@@ -52,13 +66,14 @@ class NewsVerification():
52
 
53
  def determine_text_origin(self):
54
  """
55
- Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
 
56
 
57
  Args:
58
  text: The input text to be analyzed.
59
 
60
  Returns:
61
- str: The predicted origin of the text:
62
  - "HUMAN": If the text is likely written by a human.
63
  - "MACHINE": If the text is likely generated by a machine.
64
  """
@@ -75,7 +90,7 @@ class NewsVerification():
75
  "similarity": None,
76
  "paraphrase": False,
77
  "url": "",
78
- }
79
 
80
  for index, sentence in enumerate(input_sentences):
81
  print(f"-------index = {index}-------")
@@ -83,10 +98,20 @@ class NewsVerification():
83
 
84
  if current_index >= len(input_sentences):
85
  break
86
- if current_index > index and index != 0 and index != len(input_sentences) - 1:
 
 
 
 
87
  continue
88
-
89
- paraphrase, text_url, searched_sentences, img_urls, current_index = detect_text_by_relative_search(input_sentences, index)
 
 
 
 
 
 
90
 
91
  if paraphrase is False:
92
  # add sentence to ai_sentence
@@ -95,19 +120,27 @@ class NewsVerification():
95
  ai_sentence["input_sentence"] += sentence
96
  if index == len(input_sentences) - 1:
97
  # add ai_sentences to align_sentences
98
- text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"])
 
 
99
  ai_sentence["label"] = text_prediction_label
100
  ai_sentence["similarity"] = text_prediction_score
101
  self.aligned_sentences.append(ai_sentence)
102
  else:
103
  if previous_paraphrase is False or previous_paraphrase is None:
104
  # add ai_sentences to align_sentences
105
- if ai_sentence["input_sentence"] != "" or current_index >= len(input_sentences):
106
- text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"])
 
 
 
 
 
 
107
  ai_sentence["label"] = text_prediction_label
108
  ai_sentence["similarity"] = text_prediction_score
109
  self.aligned_sentences.append(ai_sentence)
110
-
111
  # reset
112
  ai_sentence = {
113
  "input_sentence": "",
@@ -116,7 +149,7 @@ class NewsVerification():
116
  "similarity": None,
117
  "paraphrase": False,
118
  "url": "",
119
- }
120
 
121
  # add searched_sentences to align_sentences
122
  if searched_sentences["input_sentence"] != "":
@@ -125,20 +158,21 @@ class NewsVerification():
125
  searched_sentences["label"] = "HUMAN"
126
  else:
127
  searched_sentences["label"] = "MACHINE"
128
-
129
  self.aligned_sentences.append(searched_sentences)
130
 
131
  previous_paraphrase = paraphrase
132
 
133
  def determine_text_origin_2(self):
134
  """
135
- Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
 
136
 
137
  Args:
138
  text: The input text to be analyzed.
139
 
140
  Returns:
141
- str: The predicted origin of the text:
142
  - "HUMAN": If the text is likely written by a human.
143
  - "MACHINE": If the text is likely generated by a machine.
144
  """
@@ -150,17 +184,17 @@ class NewsVerification():
150
  self.aligned_sentences_df = pd.concat(
151
  [self.aligned_sentences_df, pd.DataFrame([{}])],
152
  ignore_index=False,
153
- )
154
 
155
  for index, sentence in enumerate(input_sentences):
156
  print(f"-------index = {index}-------")
157
  print(f"current_sentence = {input_sentences[index]}")
158
-
159
  if self.aligned_sentences_df["url"] is not None:
160
  continue
161
 
162
  self.aligned_sentences_df, img_urls = find_text_source(
163
- input_sentences[index],
164
  self.aligned_sentences_df,
165
  )
166
 
@@ -171,25 +205,30 @@ class NewsVerification():
171
  self.image_prediction_score = 0.0
172
  self.image_referent_url = None
173
  return
174
-
175
  for image in self.found_img_url:
176
- print(f"\tfound_img_url: {image}")
177
- matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url)
 
 
 
178
  if matched_url is not None:
179
  print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
180
  self.image_prediction_label = "HUMAN"
181
  self.image_prediction_score = similarity
182
  self.image_referent_url = matched_url
183
  return
184
-
185
- matched_url, similarity = detect_image_by_reverse_search(self.news_image)
 
 
186
  if matched_url is not None:
187
  print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
188
  self.image_prediction_label = "HUMAN"
189
  self.image_prediction_score = similarity
190
  self.image_referent_url = matched_url
191
  return
192
-
193
  detected_label, score = detect_image_by_ai_model(self.news_image)
194
  if detected_label:
195
  print(f"detected_label: {detected_label} ({score})")
@@ -197,7 +236,7 @@ class NewsVerification():
197
  self.image_prediction_score = score
198
  self.image_referent_url = None
199
  return
200
-
201
  self.image_prediction_label = "UNKNOWN"
202
  self.image_prediction_score = 50
203
  self.image_referent_url = None
@@ -209,15 +248,17 @@ class NewsVerification():
209
  text_prediction_score = 50
210
  else:
211
  text_prediction_score = self.text_prediction_score
212
-
213
  if self.image_prediction_label == "MACHINE":
214
  image_prediction_score = 100 - self.image_prediction_score
215
  elif self.image_prediction_label == "UNKNOWN":
216
  image_prediction_score = 50
217
  else:
218
  image_prediction_score = self.image_prediction_score
219
-
220
- news_prediction_score = (text_prediction_score + image_prediction_score) / 2
 
 
221
  if news_prediction_score > 50:
222
  self.news_prediction_score = news_prediction_score
223
  self.news_prediction_label = "HUMAN"
@@ -234,37 +275,25 @@ class NewsVerification():
234
  for index, aligned_sentence in enumerate(self.aligned_sentences):
235
  # Get entity-words (in pair) with colors
236
  entities_with_colors = highlight_entities(
237
- aligned_sentence["input_sentence"],
238
- aligned_sentence["matched_sentence"],
239
- )
240
  self.aligned_sentences[index]["entities"] = entities_with_colors
241
-
242
  ordinary_user_table = self.create_ordinary_user_table()
243
  fact_checker_table = self.create_fact_checker_table()
244
  governor_table = self.create_governor_table()
245
 
246
  return ordinary_user_table, fact_checker_table, governor_table
247
-
248
  def get_text_urls(self):
249
  return set(self.text_referent_url)
250
 
251
-
252
  def compare_sentences(self, sentence_1, sentence_2, position, color):
253
  """
254
- Compares two sentences and identifies common phrases, outputting their start and end positions.
255
-
256
- Args:
257
- sentence_1: The first sentence (string).
258
- sentence_2: The second sentence (string).
259
 
260
- Returns:
261
- A list of dictionaries, where each dictionary represents a common phrase and contains:
262
- - "phrase": The common phrase (string).
263
- - "start_1": The starting index of the phrase in sentence_1 (int).
264
- - "end_1": The ending index of the phrase in sentence_1 (int).
265
- - "start_2": The starting index of the phrase in sentence_2 (int).
266
- - "end_2": The ending index of the phrase in sentence_2 (int).
267
- Returns an empty list if no common phrases are found. Handles edge cases like empty strings.
268
  """
269
 
270
  if not sentence_1 or not sentence_2: # Handle empty strings
@@ -280,16 +309,20 @@ class NewsVerification():
280
  start_2 = block.b
281
  end_2 = block.b + block.size
282
 
283
- phrase = sentence_1[start_1:end_1] # Or sentence_2[start_2:end_2], they are the same
284
-
285
- common_phrases.append({
286
- "phrase": phrase,
287
- "start_1": start_1 + position,
288
- "end_1": end_1 + position,
289
- "start_2": start_2,
290
- "end_2": end_2,
291
- "color": color,
292
- })
 
 
 
 
293
  position += len(sentence_1)
294
  return common_phrases, position
295
 
@@ -297,17 +330,17 @@ class NewsVerification():
297
  rows = []
298
  max_length = 30 # TODO: put this in configuration
299
  rows.append(self.format_image_fact_checker_row(max_length))
300
-
301
  for aligned_sentence in self.aligned_sentences:
302
  if "input_sentence" not in aligned_sentence:
303
  continue
304
-
305
  # Get index of equal phrases in input and source sentences
306
  equal_idx_1, equal_idx_2 = extract_equal_text(
307
- aligned_sentence["input_sentence"],
308
- aligned_sentence["matched_sentence"],
309
- )
310
-
311
  # Get entity-words (in pair) with colors
312
  # entities_with_colors = highlight_entities(
313
  # aligned_sentence["input_sentence"],
@@ -320,32 +353,35 @@ class NewsVerification():
320
  equal_idx_1,
321
  equal_idx_2,
322
  aligned_sentence["entities"],
323
- ]
324
  )
325
 
326
  for row in self.fact_checker_table:
327
  formatted_row = self.format_text_fact_checker_row(row, max_length)
328
  rows.append(formatted_row)
329
-
330
  table = "\n".join(rows)
331
  return f"""
332
- <h5>Comparison between input news and source news:</h5>
333
- <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
334
- <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
335
- <thead>
336
- <tr>
337
- <th>Input news</th>
338
- <th>Source (URL provided in Originality column correspondingly)</th>
339
- <th>Forensic</th>
340
- <th>Originality</th>
341
- </tr>
342
- </thead>
343
- <tbody>
344
- {table}
345
- </tbody>
346
- </table>
347
-
348
- <style>
 
 
 
349
  """
350
 
351
  def format_text_fact_checker_row(self, row, max_length=30):
@@ -354,50 +390,76 @@ class NewsVerification():
354
  return ""
355
  if row[0]["matched_sentence"] != "": # source is not empty
356
  # highlight entities
357
- input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input")
358
- source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source")
359
  entity_count = len(row[3])
360
-
361
  # Color overlapping words
362
- input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
363
- source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source) # text, index of highlight words
364
-
365
- input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
366
- source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
367
  else:
368
  input_sentence = row[0]["input_sentence"]
369
  source_sentence = row[0]["matched_sentence"]
370
 
371
  label = row[0]["label"]
372
  score = row[0]["similarity"]
373
-
374
- url = row[0]["url"] #
375
  short_url = self.shorten_url(url, max_length)
376
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
377
-
378
  entity_count_text = self.get_entity_count_text(entity_count)
379
-
380
  return f"""
381
  <tr>
382
  <td>{input_sentence}</td>
383
  <td>{source_sentence}</td>
384
- <td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
385
  <td>{source_text_url}</td>
386
  </tr>
387
  """
388
 
389
- def format_image_fact_checker_row(self, max_length=30):
390
-
391
- if self.image_referent_url is not None or self.image_referent_url != "":
392
- source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
 
 
 
393
  short_url = self.shorten_url(self.image_referent_url, max_length)
394
- source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
 
 
395
  else:
396
  source_image = "Image not found"
397
  source_image_url = ""
398
 
399
- return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
400
-
401
 
402
  def create_ordinary_user_table(self):
403
  rows = []
@@ -405,24 +467,27 @@ class NewsVerification():
405
  rows.append(self.format_image_ordinary_user_row(max_length))
406
  rows.append(self.format_text_ordinary_user_row(max_length))
407
  table = "\n".join(rows)
408
-
409
  return f"""
410
- <h5>Comparison between input news and source news:</h5>
411
- <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
412
- <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
413
- <thead>
414
- <tr>
415
- <th>Input news</th>
416
- <th>Forensic</th>
417
- <th>Originality</th>
418
- </tr>
419
- </thead>
420
- <tbody>
421
- {table}
422
- </tbody>
423
- </table>
424
-
425
- <style>
 
 
 
426
  """
427
 
428
  def format_text_ordinary_user_row(self, max_length=30):
@@ -436,152 +501,184 @@ class NewsVerification():
436
  continue
437
  input_sentences += row["input_sentence"] + "<br><br>"
438
  label = self.aligned_sentences[index]["label"]
439
-
440
- url = self.aligned_sentences[index]["url"] #
441
  short_url = self.shorten_url(url, max_length)
442
  source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
443
  sentence_count += 1
444
-
445
  scores, label = self.calculate_score_label()
446
-
447
  return f"""
448
  <tr>
449
  <td>{input_sentences}</td>
450
- <td>{label}<br>({scores*100:.2f}%)</td>
451
  <td>{source_text_urls}</td>
452
  </tr>
453
  """
454
 
455
- def format_image_ordinary_user_row(self, max_length=30):
456
-
457
- if self.image_referent_url is not None or self.image_referent_url != "":
458
- # source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
 
 
459
  short_url = self.shorten_url(self.image_referent_url, max_length)
460
- source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
 
 
461
  else:
462
  # source_image = "Image not found"
463
  source_image_url = ""
464
 
465
- return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
466
-
467
 
468
  def create_governor_table(self):
469
  rows = []
470
  max_length = 30 # TODO: put this in configuration
471
  rows.append(self.format_image_governor_row(max_length))
472
-
473
  for aligned_sentence in self.aligned_sentences:
474
  if "input_sentence" not in aligned_sentence:
475
  continue
476
-
477
  # Get index of equal phrases in input and source sentences
478
  equal_idx_1, equal_idx_2 = extract_equal_text(
479
- aligned_sentence["input_sentence"],
480
- aligned_sentence["matched_sentence"],
481
- )
482
-
483
  # Get entity-words (in pair) with colors
484
  # entities_with_colors = highlight_entities(
485
  # aligned_sentence["input_sentence"],
486
  # aligned_sentence["matched_sentence"],
487
  # )
488
-
489
  self.governor_table.append(
490
  [
491
  aligned_sentence,
492
  equal_idx_1,
493
  equal_idx_2,
494
  aligned_sentence["entities"],
495
- ]
496
  )
497
 
498
  formatted_row = self.format_text_governor_row(max_length)
499
  rows.append(formatted_row)
500
-
501
  table = "\n".join(rows)
502
  return f"""
503
- <h5>Comparison between input news and source news:</h5>
504
- <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
505
- <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
506
- <thead>
507
- <tr>
508
- <th>Input news</th>
509
- <th>Source (URL provided in Originality column correspondingly)</th>
510
- <th>Forensic</th>
511
- <th>Originality</th>
512
- </tr>
513
- </thead>
514
- <tbody>
515
- {table}
516
- </tbody>
517
- </table>
518
-
519
- <style>
 
 
 
520
  """
521
 
522
- def format_text_governor_row(self, max_length=30):
523
  input_sentences = ""
524
  source_sentences = ""
525
  source_text_urls = ""
526
  label = ""
527
- scores = 0
528
  sentence_count = 0
529
  entity_count = 0
530
  for row in self.governor_table:
531
  print(f"governor_row: {row}")
532
  if row[0]["input_sentence"] == "":
533
  continue
534
-
535
  if row[0]["matched_sentence"] != "": # source is not empty
536
  # highlight entities
537
- input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input", entity_count)
538
- source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source", entity_count)
 
 
 
 
 
 
 
 
 
 
539
  entity_count += len(row[3])
540
-
541
  # Color overlapping words
542
- input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
543
- source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source) # text, index of highlight words
544
-
545
- input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
546
- source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
 
 
 
547
 
548
  else:
549
  input_sentence = row[0]["input_sentence"]
550
  source_sentence = row[0]["matched_sentence"]
551
-
552
- # convert score to HUMAN-based score:
553
  input_sentences += input_sentence + "<br><br>"
554
  source_sentences += source_sentence + "<br><br>"
555
-
556
-
557
  url = row[0]["url"]
558
  short_url = self.shorten_url(url, max_length)
559
  source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
560
  sentence_count += 1
561
-
562
  score, label = self.calculate_score_label()
563
  entity_count_text = self.get_entity_count_text(entity_count)
564
 
565
  return f"""
566
- <tr>
567
- <td>{input_sentences}</td>
568
- <td>{source_sentences}</td>
569
- <td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
570
- <td>{source_text_urls}</td>
571
- </tr>
572
  """
573
 
574
  def format_image_governor_row(self, max_length=30):
575
- if self.image_referent_url is not None or self.image_referent_url != "":
576
- source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
 
 
 
577
  short_url = self.shorten_url(self.image_referent_url, max_length)
578
- source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
 
 
579
  else:
580
  source_image = "Image not found"
581
  source_image_url = ""
582
 
583
- return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
584
-
585
 
586
  def get_entity_count_text(self, entity_count):
587
  if entity_count <= 0:
@@ -595,52 +692,51 @@ class NewsVerification():
595
  def shorten_url(self, url, max_length=30):
596
  if url is None:
597
  return ""
598
-
599
  if len(url) > max_length:
600
  short_url = url[:max_length] + "..."
601
  else:
602
  short_url = url
603
  return short_url
604
 
605
-
606
  def color_text(self, text, colored_idx, highlighted_idx):
607
  paragraph = ""
608
  words = text.split()
609
-
610
  starts, ends = self.extract_starts_ends(colored_idx)
611
  starts, ends = self.filter_indices(starts, ends, highlighted_idx)
612
 
613
  previous_end = 0
614
  for start, end in zip(starts, ends):
615
  paragraph += " ".join(words[previous_end:start])
616
-
617
  equal_words = " ".join(words[start:end])
618
  paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
619
-
620
  previous_end = end
621
-
622
- # Some left words due to the punctuation separated from
623
  # the highlighting text
624
  equal_words = " ".join(words[previous_end:])
625
  print(f"starts_2: {previous_end}")
626
- print(f"ends_2: {len(words)-1}")
627
  print(f"equal_words: {words[previous_end:]}")
628
  paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
629
 
630
  return paragraph
631
-
632
  def extract_starts_ends(self, colored_idx):
633
  starts = []
634
  ends = []
635
  for index in colored_idx:
636
- starts.append(index['start'])
637
- ends.append(index['end'])
638
  return starts, ends
639
-
640
-
641
  def filter_indices(self, starts, ends, ignore_indices):
642
  """
643
- Filters start and end indices to exclude any indices present in the ignore_indices list.
 
644
 
645
  Args:
646
  starts: A list of starting indices.
@@ -648,23 +744,26 @@ class NewsVerification():
648
  ignore_indices: A list of indices to exclude.
649
 
650
  Returns:
651
- A tuple containing two new lists: filtered_starts and filtered_ends.
652
- Returns empty lists if the input is invalid or if all ranges are filtered out.
 
653
  Prints error messages for invalid input.
654
-
655
  Examples:
656
  starts = [0, 5, 10]
657
  ends = [3, 7, 12]
658
  ignore_indices = [1, 2, 11, 17]
659
-
660
- # Output:
661
  starts = [0, 3, 5, 10, 12]
662
  ends = [0, 3, 7, 10, 12]
663
 
664
  """
665
 
666
  if len(starts) != len(ends):
667
- print("Error: The 'starts' and 'ends' lists must have the same length.")
 
 
668
  return [], []
669
 
670
  filtered_starts = []
@@ -675,10 +774,11 @@ class NewsVerification():
675
  end = ends[i]
676
 
677
  if end < start:
678
- print(f"Error: End index {end} is less than start index {start} at position {i}.")
 
 
679
  return [], []
680
 
681
-
682
  start_end = list(range(start, end + 1, 1))
683
  start_end = list(set(start_end) - set(ignore_indices))
684
  new_start, new_end = self.extract_sequences(start_end)
@@ -690,7 +790,7 @@ class NewsVerification():
690
  def extract_sequences(self, numbers):
691
  if len(numbers) == 1:
692
  return [numbers[0]], [numbers[0]]
693
-
694
  numbers.sort()
695
  starts = []
696
  ends = []
@@ -699,21 +799,21 @@ class NewsVerification():
699
  start = number
700
  end = number
701
  continue
702
-
703
- if number - 1 == numbers[i-1]:
704
  end = number
705
  else:
706
  starts.append(start)
707
  ends.append(end + 1)
708
  start = number
709
  end = number
710
-
711
  if i == len(numbers) - 1:
712
  starts.append(start)
713
  ends.append(end + 1)
714
-
715
  return starts, ends
716
-
717
  def calculate_score_label(self):
718
  human_score = []
719
  machine_score = []
@@ -726,7 +826,7 @@ class NewsVerification():
726
  elif sentence["label"] == "MACHINE":
727
  machine_score.append(1 - sentence["similarity"])
728
  machine_flag = True
729
-
730
  if machine_flag is True and len(machine_score) > 0:
731
  # average value of machine_score
732
  machine_score_avg = sum(machine_score) / len(machine_score)
@@ -739,5 +839,3 @@ class NewsVerification():
739
  return human_score_avg, "HUMAN"
740
  else:
741
  return 0, "UNKNOWN"
742
-
743
-
 
1
  from difflib import SequenceMatcher
2
 
3
  import pandas as pd
4
+
5
+ from src.application.image.image_detection import (
6
+ detect_image_by_ai_model,
7
+ detect_image_by_reverse_search,
8
+ detect_image_from_news_image,
9
+ )
10
+ from src.application.text.entity import (
11
+ apply_highlight,
12
+ highlight_entities,
13
+ )
14
  from src.application.text.helper import extract_equal_text
15
  from src.application.text.model_detection import detect_text_by_ai_model
16
  from src.application.text.preprocessing import split_into_paragraphs
17
+ from src.application.text.search_detection import (
18
+ check_human,
19
+ detect_text_by_relative_search,
20
+ find_text_source,
21
+ )
22
 
23
 
24
+ class NewsVerification:
25
  def __init__(self):
26
  self.news_text = ""
27
  self.news_title = ""
28
  self.news_content = ""
29
  self.news_image = ""
30
+
31
+ self.text_prediction_label: list[str] = []
32
+ self.text_prediction_score: list[float] = []
33
+ self.text_referent_url: list[str] = []
34
+ self.image_prediction_label: list[str] = []
35
+ self.image_prediction_score: list[str] = []
36
+ self.image_referent_url: list[str] = []
37
  self.news_prediction_label = ""
38
  self.news_prediction_score = -1
39
+
40
+ self.found_img_url: list[str] = []
41
+ self.aligned_sentences: list[dict] = []
42
+ self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
43
+ columns=[
44
+ "input_sentence",
45
+ "matched_sentence",
46
+ "label",
47
+ "similarity",
48
+ "paraphrase",
49
+ "url",
50
+ "group",
51
+ "entities",
52
+ ],
53
+ )
54
+ self.is_paraphrased: list[bool] = []
55
+
56
+ self.ordinary_user_table: list = []
57
+ self.fact_checker_table: list = []
58
+ self.governor_table: list = []
59
  self.entities_with_colors = []
60
+
61
  def load_news(self, news_title, news_content, news_image):
62
  self.news_text = news_title + "\n\n" + news_content
63
  self.news_title = news_title
 
66
 
67
  def determine_text_origin(self):
68
  """
69
+ Determines the origin of the given text based on paraphrasing detection
70
+ and human authorship analysis.
71
 
72
  Args:
73
  text: The input text to be analyzed.
74
 
75
  Returns:
76
+ str: The predicted origin of the text:
77
  - "HUMAN": If the text is likely written by a human.
78
  - "MACHINE": If the text is likely generated by a machine.
79
  """
 
90
  "similarity": None,
91
  "paraphrase": False,
92
  "url": "",
93
+ }
94
 
95
  for index, sentence in enumerate(input_sentences):
96
  print(f"-------index = {index}-------")
 
98
 
99
  if current_index >= len(input_sentences):
100
  break
101
+ if (
102
+ current_index > index
103
+ and index != 0
104
+ and index != len(input_sentences) - 1
105
+ ):
106
  continue
107
+
108
+ (
109
+ paraphrase,
110
+ text_url,
111
+ searched_sentences,
112
+ img_urls,
113
+ current_index,
114
+ ) = detect_text_by_relative_search(input_sentences, index)
115
 
116
  if paraphrase is False:
117
  # add sentence to ai_sentence
 
120
  ai_sentence["input_sentence"] += sentence
121
  if index == len(input_sentences) - 1:
122
  # add ai_sentences to align_sentences
123
+ text_prediction_label, text_prediction_score = (
124
+ detect_text_by_ai_model(ai_sentence["input_sentence"])
125
+ )
126
  ai_sentence["label"] = text_prediction_label
127
  ai_sentence["similarity"] = text_prediction_score
128
  self.aligned_sentences.append(ai_sentence)
129
  else:
130
  if previous_paraphrase is False or previous_paraphrase is None:
131
  # add ai_sentences to align_sentences
132
+ if ai_sentence[
133
+ "input_sentence"
134
+ ] != "" or current_index >= len(input_sentences):
135
+ text_prediction_label, text_prediction_score = (
136
+ detect_text_by_ai_model(
137
+ ai_sentence["input_sentence"],
138
+ )
139
+ )
140
  ai_sentence["label"] = text_prediction_label
141
  ai_sentence["similarity"] = text_prediction_score
142
  self.aligned_sentences.append(ai_sentence)
143
+
144
  # reset
145
  ai_sentence = {
146
  "input_sentence": "",
 
149
  "similarity": None,
150
  "paraphrase": False,
151
  "url": "",
152
+ }
153
 
154
  # add searched_sentences to align_sentences
155
  if searched_sentences["input_sentence"] != "":
 
158
  searched_sentences["label"] = "HUMAN"
159
  else:
160
  searched_sentences["label"] = "MACHINE"
161
+
162
  self.aligned_sentences.append(searched_sentences)
163
 
164
  previous_paraphrase = paraphrase
165
 
166
  def determine_text_origin_2(self):
167
  """
168
+ Determines the origin of the given text based on paraphrasing detection
169
+ and human authorship analysis.
170
 
171
  Args:
172
  text: The input text to be analyzed.
173
 
174
  Returns:
175
+ str: The predicted origin of the text:
176
  - "HUMAN": If the text is likely written by a human.
177
  - "MACHINE": If the text is likely generated by a machine.
178
  """
 
184
  self.aligned_sentences_df = pd.concat(
185
  [self.aligned_sentences_df, pd.DataFrame([{}])],
186
  ignore_index=False,
187
+ )
188
 
189
  for index, sentence in enumerate(input_sentences):
190
  print(f"-------index = {index}-------")
191
  print(f"current_sentence = {input_sentences[index]}")
192
+
193
  if self.aligned_sentences_df["url"] is not None:
194
  continue
195
 
196
  self.aligned_sentences_df, img_urls = find_text_source(
197
+ input_sentences[index],
198
  self.aligned_sentences_df,
199
  )
200
 
 
205
  self.image_prediction_score = 0.0
206
  self.image_referent_url = None
207
  return
208
+
209
  for image in self.found_img_url:
210
+ print(f"\tfound_img_url: {image}")
211
+ matched_url, similarity = detect_image_from_news_image(
212
+ self.news_image,
213
+ self.found_img_url,
214
+ )
215
  if matched_url is not None:
216
  print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
217
  self.image_prediction_label = "HUMAN"
218
  self.image_prediction_score = similarity
219
  self.image_referent_url = matched_url
220
  return
221
+
222
+ matched_url, similarity = detect_image_by_reverse_search(
223
+ self.news_image,
224
+ )
225
  if matched_url is not None:
226
  print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
227
  self.image_prediction_label = "HUMAN"
228
  self.image_prediction_score = similarity
229
  self.image_referent_url = matched_url
230
  return
231
+
232
  detected_label, score = detect_image_by_ai_model(self.news_image)
233
  if detected_label:
234
  print(f"detected_label: {detected_label} ({score})")
 
236
  self.image_prediction_score = score
237
  self.image_referent_url = None
238
  return
239
+
240
  self.image_prediction_label = "UNKNOWN"
241
  self.image_prediction_score = 50
242
  self.image_referent_url = None
 
248
  text_prediction_score = 50
249
  else:
250
  text_prediction_score = self.text_prediction_score
251
+
252
  if self.image_prediction_label == "MACHINE":
253
  image_prediction_score = 100 - self.image_prediction_score
254
  elif self.image_prediction_label == "UNKNOWN":
255
  image_prediction_score = 50
256
  else:
257
  image_prediction_score = self.image_prediction_score
258
+
259
+ news_prediction_score = (
260
+ text_prediction_score + image_prediction_score
261
+ ) / 2
262
  if news_prediction_score > 50:
263
  self.news_prediction_score = news_prediction_score
264
  self.news_prediction_label = "HUMAN"
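A quick worked example of the combination in this hunk, with illustrative numbers: a HUMAN text prediction of 80 stays 80, a MACHINE image prediction of 70 is flipped to 100 - 70 = 30, so news_prediction_score = (80 + 30) / 2 = 55, and 55 > 50 labels the news HUMAN. A condensed sketch of the same logic (the below-50 label is assumed; only the "> 50 -> HUMAN" branch is visible here):

def combine_scores(text_score, image_label, image_score):
    # Image-side conversion, as in the branches above.
    if image_label == "MACHINE":
        image_score = 100 - image_score
    elif image_label == "UNKNOWN":
        image_score = 50
    news_score = (text_score + image_score) / 2
    # The else label is an assumption; this hunk only shows the HUMAN branch.
    return news_score, "HUMAN" if news_score > 50 else "MACHINE"

print(combine_scores(80, "MACHINE", 70))  # (55.0, 'HUMAN')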
 
275
  for index, aligned_sentence in enumerate(self.aligned_sentences):
276
  # Get entity-words (in pair) with colors
277
  entities_with_colors = highlight_entities(
278
+ aligned_sentence["input_sentence"],
279
+ aligned_sentence["matched_sentence"],
280
+ )
281
  self.aligned_sentences[index]["entities"] = entities_with_colors
282
+
283
  ordinary_user_table = self.create_ordinary_user_table()
284
  fact_checker_table = self.create_fact_checker_table()
285
  governor_table = self.create_governor_table()
286
 
287
  return ordinary_user_table, fact_checker_table, governor_table
288
+
289
  def get_text_urls(self):
290
  return set(self.text_referent_url)
291
 
 
292
  def compare_sentences(self, sentence_1, sentence_2, position, color):
293
  """
294
+ Compares two sentences and identifies common phrases,
295
+ outputting their start and end positions.
 
 
 
296
 
 
 
 
 
 
 
 
 
297
  """
298
 
299
  if not sentence_1 or not sentence_2: # Handle empty strings
 
309
  start_2 = block.b
310
  end_2 = block.b + block.size
311
 
312
+ phrase = sentence_1[
313
+ start_1:end_1
314
+ ] # Or sentence_2[start_2:end_2], they are the same
315
+
316
+ common_phrases.append(
317
+ {
318
+ "phrase": phrase,
319
+ "start_1": start_1 + position,
320
+ "end_1": end_1 + position,
321
+ "start_2": start_2,
322
+ "end_2": end_2,
323
+ "color": color,
324
+ },
325
+ )
326
  position += len(sentence_1)
327
  return common_phrases, position
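compare_sentences() builds each dictionary from difflib matching blocks (block.a, block.b, block.size). A minimal standalone sketch of that pattern, assuming the blocks come from SequenceMatcher.get_matching_blocks() as the import at the top of this file suggests:

from difflib import SequenceMatcher

sentence_1 = "The mayor opened the new bridge on Monday"
sentence_2 = "A mayor opened the new bridge last week"
for block in SequenceMatcher(None, sentence_1, sentence_2).get_matching_blocks():
    if block.size == 0:  # the final sentinel block is always empty
        continue
    # Same slicing as above: the phrase is identical in both sentences.
    print(repr(sentence_1[block.a:block.a + block.size]))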
328
 
 
330
  rows = []
331
  max_length = 30 # TODO: put this in configuration
332
  rows.append(self.format_image_fact_checker_row(max_length))
333
+
334
  for aligned_sentence in self.aligned_sentences:
335
  if "input_sentence" not in aligned_sentence:
336
  continue
337
+
338
  # Get index of equal phrases in input and source sentences
339
  equal_idx_1, equal_idx_2 = extract_equal_text(
340
+ aligned_sentence["input_sentence"],
341
+ aligned_sentence["matched_sentence"],
342
+ )
343
+
344
  # Get entity-words (in pair) with colors
345
  # entities_with_colors = highlight_entities(
346
  # aligned_sentence["input_sentence"],
 
353
  equal_idx_1,
354
  equal_idx_2,
355
  aligned_sentence["entities"],
356
+ ],
357
  )
358
 
359
  for row in self.fact_checker_table:
360
  formatted_row = self.format_text_fact_checker_row(row, max_length)
361
  rows.append(formatted_row)
362
+
363
  table = "\n".join(rows)
364
  return f"""
365
+ <h5>Comparison between input news and source news:</h5>
366
+ <table border="1" style="width:100%; text-align:left;">
367
+ <col style="width: 170px;">
368
+ <col style="width: 170px;">
369
+ <col style="width: 30px;">
370
+ <col style="width: 75px;">
371
+ <thead>
372
+ <tr>
373
+ <th>Input news</th>
374
+ <th>Source (corresponding URL provided in Originality)</th>
375
+ <th>Forensic</th>
376
+ <th>Originality</th>
377
+ </tr>
378
+ </thead>
379
+ <tbody>
380
+ {table}
381
+ </tbody>
382
+ </table>
383
+
384
+ <style>
385
  """
386
 
387
  def format_text_fact_checker_row(self, row, max_length=30):
 
390
  return ""
391
  if row[0]["matched_sentence"] != "": # source is not empty
392
  # highlight entities
393
+ input_sentence, highlight_idx_input = apply_highlight(
394
+ row[0]["input_sentence"],
395
+ row[3],
396
+ "input",
397
+ )
398
+ source_sentence, highlight_idx_source = apply_highlight(
399
+ row[0]["matched_sentence"],
400
+ row[3],
401
+ "source",
402
+ )
403
  entity_count = len(row[3])
404
+
405
  # Color overlapping words
406
+ input_sentence = self.color_text(
407
+ input_sentence,
408
+ row[1],
409
+ highlight_idx_input,
410
+ ) # text, index of highlight words
411
+ source_sentence = self.color_text(
412
+ source_sentence,
413
+ row[2],
414
+ highlight_idx_source,
415
+ ) # text, index of highlight words
416
+
417
+ input_sentence = input_sentence.replace(
418
+ "span_style",
419
+ "span style",
420
+ ).replace("1px_4px", "1px 4px")
421
+ source_sentence = source_sentence.replace(
422
+ "span_style",
423
+ "span style",
424
+ ).replace("1px_4px", "1px 4px")
425
  else:
426
  input_sentence = row[0]["input_sentence"]
427
  source_sentence = row[0]["matched_sentence"]
428
 
429
  label = row[0]["label"]
430
  score = row[0]["similarity"]
431
+
432
+ url = row[0]["url"] #
433
  short_url = self.shorten_url(url, max_length)
434
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
435
+
436
  entity_count_text = self.get_entity_count_text(entity_count)
437
+
438
  return f"""
439
  <tr>
440
  <td>{input_sentence}</td>
441
  <td>{source_sentence}</td>
442
+ <td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td> # noqa: E501
443
  <td>{source_text_url}</td>
444
  </tr>
445
  """
446
 
447
+ def format_image_fact_checker_row(self, max_length=30):
448
+
449
+ if (
450
+ self.image_referent_url is not None
451
+ or self.image_referent_url != ""
452
+ ):
453
+ source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
454
  short_url = self.shorten_url(self.image_referent_url, max_length)
455
+ source_image_url = (
456
+ f"""<a href="{self.image_referent_url}">{short_url}</a>"""
457
+ )
458
  else:
459
  source_image = "Image not found"
460
  source_image_url = ""
461
 
462
+ return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" # noqa: E501
 
463
 
464
  def create_ordinary_user_table(self):
465
  rows = []
 
467
  rows.append(self.format_image_ordinary_user_row(max_length))
468
  rows.append(self.format_text_ordinary_user_row(max_length))
469
  table = "\n".join(rows)
470
+
471
  return f"""
472
+ <h5>Comparison between input news and source news:</h5>
473
+ <table border="1" style="width:100%; text-align:left; border-collapse:collapse;"> # noqa: E501
474
+ <col style="width: 170px;">
475
+ <col style="width: 170px;">
476
+ <col style="width: 30px;">
477
+ <col style="width: 75px;">
478
+ <thead>
479
+ <tr>
480
+ <th>Input news</th>
481
+ <th>Forensic</th>
482
+ <th>Originality</th>
483
+ </tr>
484
+ </thead>
485
+ <tbody>
486
+ {table}
487
+ </tbody>
488
+ </table>
489
+
490
+ <style>
491
  """
492
 
493
  def format_text_ordinary_user_row(self, max_length=30):
 
501
  continue
502
  input_sentences += row["input_sentence"] + "<br><br>"
503
  label = self.aligned_sentences[index]["label"]
504
+
505
+ url = self.aligned_sentences[index]["url"] #
506
  short_url = self.shorten_url(url, max_length)
507
  source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
508
  sentence_count += 1
509
+
510
  scores, label = self.calculate_score_label()
511
+
512
  return f"""
513
  <tr>
514
  <td>{input_sentences}</td>
515
+ <td>{label}<br>({scores * 100:.2f}%)</td>
516
  <td>{source_text_urls}</td>
517
  </tr>
518
  """
519
 
520
+ def format_image_ordinary_user_row(self, max_length=30):
521
+
522
+ if (
523
+ self.image_referent_url is not None
524
+ or self.image_referent_url != ""
525
+ ):
526
  short_url = self.shorten_url(self.image_referent_url, max_length)
527
+ source_image_url = (
528
+ f"""<a href="{self.image_referent_url}">{short_url}</a>"""
529
+ )
530
  else:
531
  # source_image = "Image not found"
532
  source_image_url = ""
533
 
534
+ return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" # noqa: E501
 
535
 
536
  def create_governor_table(self):
537
  rows = []
538
  max_length = 30 # TODO: put this in configuration
539
  rows.append(self.format_image_governor_row(max_length))
540
+
541
  for aligned_sentence in self.aligned_sentences:
542
  if "input_sentence" not in aligned_sentence:
543
  continue
544
+
545
  # Get index of equal phrases in input and source sentences
546
  equal_idx_1, equal_idx_2 = extract_equal_text(
547
+ aligned_sentence["input_sentence"],
548
+ aligned_sentence["matched_sentence"],
549
+ )
550
+
551
  # Get entity-words (in pair) with colors
552
  # entities_with_colors = highlight_entities(
553
  # aligned_sentence["input_sentence"],
554
  # aligned_sentence["matched_sentence"],
555
  # )
556
+
557
  self.governor_table.append(
558
  [
559
  aligned_sentence,
560
  equal_idx_1,
561
  equal_idx_2,
562
  aligned_sentence["entities"],
563
+ ],
564
  )
565
 
566
  formatted_row = self.format_text_governor_row(max_length)
567
  rows.append(formatted_row)
568
+
569
  table = "\n".join(rows)
570
  return f"""
571
+ <h5>Comparison between input news and source news:</h5>
572
+ <table border="1" style="width:100%; text-align:left;">
573
+ <col style="width: 170px;">
574
+ <col style="width: 170px;">
575
+ <col style="width: 30px;">
576
+ <col style="width: 75px;">
577
+ <thead>
578
+ <tr>
579
+ <th>Input news</th>
580
+ <th>Source (corresponding URL provided in Originality)</th>
581
+ <th>Forensic</th>
582
+ <th>Originality</th>
583
+ </tr>
584
+ </thead>
585
+ <tbody>
586
+ {table}
587
+ </tbody>
588
+ </table>
589
+
590
+ <style>
591
  """
592
 
593
+ def format_text_governor_row(self, max_length=30):
594
  input_sentences = ""
595
  source_sentences = ""
596
  source_text_urls = ""
597
  label = ""
 
598
  sentence_count = 0
599
  entity_count = 0
600
  for row in self.governor_table:
601
  print(f"governor_row: {row}")
602
  if row[0]["input_sentence"] == "":
603
  continue
604
+
605
  if row[0]["matched_sentence"] != "": # source is not empty
606
  # highlight entities
607
+ input_sentence, highlight_idx_input = apply_highlight(
608
+ row[0]["input_sentence"],
609
+ row[3],
610
+ "input",
611
+ entity_count,
612
+ )
613
+ source_sentence, highlight_idx_source = apply_highlight(
614
+ row[0]["matched_sentence"],
615
+ row[3],
616
+ "source",
617
+ entity_count,
618
+ )
619
  entity_count += len(row[3])
620
+
621
  # Color overlapping words
622
+ input_sentence = self.color_text(
623
+ input_sentence,
624
+ row[1],
625
+ highlight_idx_input,
626
+ ) # text, index of highlight words
627
+ source_sentence = self.color_text(
628
+ source_sentence,
629
+ row[2],
630
+ highlight_idx_source,
631
+ ) # text, index of highlight words
632
+
633
+ input_sentence = input_sentence.replace(
634
+ "span_style",
635
+ "span style",
636
+ ).replace("1px_4px", "1px 4px")
637
+ source_sentence = source_sentence.replace(
638
+ "span_style",
639
+ "span style",
640
+ ).replace("1px_4px", "1px 4px")
641
 
642
  else:
643
  input_sentence = row[0]["input_sentence"]
644
  source_sentence = row[0]["matched_sentence"]
645
+
646
+ # convert score to HUMAN-based score:
647
  input_sentences += input_sentence + "<br><br>"
648
  source_sentences += source_sentence + "<br><br>"
649
+
 
650
  url = row[0]["url"]
651
  short_url = self.shorten_url(url, max_length)
652
  source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
653
  sentence_count += 1
654
+
655
  score, label = self.calculate_score_label()
656
  entity_count_text = self.get_entity_count_text(entity_count)
657
 
658
  return f"""
659
+ <tr>
660
+ <td>{input_sentences}</td>
661
+ <td>{source_sentences}</td>
662
+ <td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td>
663
+ <td>{source_text_urls}</td>
664
+ </tr>
665
  """
666
 
667
  def format_image_governor_row(self, max_length=30):
668
+ if (
669
+ self.image_referent_url is not None
670
+ or self.image_referent_url != ""
671
+ ):
672
+ source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
673
  short_url = self.shorten_url(self.image_referent_url, max_length)
674
+ source_image_url = (
675
+ f"""<a href="{self.image_referent_url}">{short_url}</a>"""
676
+ )
677
  else:
678
  source_image = "Image not found"
679
  source_image_url = ""
680
 
681
+ return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" # noqa: E501
 
682
 
683
  def get_entity_count_text(self, entity_count):
684
  if entity_count <= 0:
 
692
  def shorten_url(self, url, max_length=30):
693
  if url is None:
694
  return ""
695
+
696
  if len(url) > max_length:
697
  short_url = url[:max_length] + "..."
698
  else:
699
  short_url = url
700
  return short_url
701
 
 
702
  def color_text(self, text, colored_idx, highlighted_idx):
703
  paragraph = ""
704
  words = text.split()
705
+
706
  starts, ends = self.extract_starts_ends(colored_idx)
707
  starts, ends = self.filter_indices(starts, ends, highlighted_idx)
708
 
709
  previous_end = 0
710
  for start, end in zip(starts, ends):
711
  paragraph += " ".join(words[previous_end:start])
712
+
713
  equal_words = " ".join(words[start:end])
714
  paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
715
+
716
  previous_end = end
717
+
718
+ # Some left words due to the punctuation separated from
719
  # the highlighting text
720
  equal_words = " ".join(words[previous_end:])
721
  print(f"starts_2: {previous_end}")
722
+ print(f"ends_2: {len(words) - 1}")
723
  print(f"equal_words: {words[previous_end:]}")
724
  paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
725
 
726
  return paragraph
727
+
728
  def extract_starts_ends(self, colored_idx):
729
  starts = []
730
  ends = []
731
  for index in colored_idx:
732
+ starts.append(index["start"])
733
+ ends.append(index["end"])
734
  return starts, ends
735
+
 
736
  def filter_indices(self, starts, ends, ignore_indices):
737
  """
738
+ Filters start and end indices to exclude any indices present in the
739
+ ignore_indices list.
740
 
741
  Args:
742
  starts: A list of starting indices.
 
744
  ignore_indices: A list of indices to exclude.
745
 
746
  Returns:
747
+ A tuple of two lists: filtered_starts and filtered_ends.
748
+ Returns empty lists if the input is invalid
749
+ or if all ranges are filtered out.
750
  Prints error messages for invalid input.
751
+
752
  Examples:
753
  starts = [0, 5, 10]
754
  ends = [3, 7, 12]
755
  ignore_indices = [1, 2, 11, 17]
756
+
757
+ # Output:
758
  starts = [0, 3, 5, 10, 12]
759
  ends = [0, 3, 7, 10, 12]
760
 
761
  """
762
 
763
  if len(starts) != len(ends):
764
+ print(
765
+ "Error: The 'starts' and 'ends' lists must have the same length.", # noqa: E501
766
+ )
767
  return [], []
768
 
769
  filtered_starts = []
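filter_indices() drops word positions that are already highlighted as entities from each colored range, so color_text() does not paint them again; the core is a set difference over the expanded range (shown a few lines further down), after which extract_sequences() regroups the survivors into contiguous runs. A tiny illustration with made-up indices:

start, end = 0, 3
ignore_indices = {1, 2, 11, 17}  # positions already used by entity highlights
kept = sorted(set(range(start, end + 1)) - ignore_indices)
print(kept)  # [0, 3] -- extract_sequences() then turns these into new start/end runs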
 
774
  end = ends[i]
775
 
776
  if end < start:
777
+ print(
778
+ f"Error: End index {end} is less than start index {start} at position {i}.", # noqa: E501
779
+ )
780
  return [], []
781
 
 
782
  start_end = list(range(start, end + 1, 1))
783
  start_end = list(set(start_end) - set(ignore_indices))
784
  new_start, new_end = self.extract_sequences(start_end)
 
790
  def extract_sequences(self, numbers):
791
  if len(numbers) == 1:
792
  return [numbers[0]], [numbers[0]]
793
+
794
  numbers.sort()
795
  starts = []
796
  ends = []
 
799
  start = number
800
  end = number
801
  continue
802
+
803
+ if number - 1 == numbers[i - 1]:
804
  end = number
805
  else:
806
  starts.append(start)
807
  ends.append(end + 1)
808
  start = number
809
  end = number
810
+
811
  if i == len(numbers) - 1:
812
  starts.append(start)
813
  ends.append(end + 1)
814
+
815
  return starts, ends
816
+
817
  def calculate_score_label(self):
818
  human_score = []
819
  machine_score = []
 
826
  elif sentence["label"] == "MACHINE":
827
  machine_score.append(1 - sentence["similarity"])
828
  machine_flag = True
829
+
830
  if machine_flag is True and len(machine_score) > 0:
831
  # average value of machine_score
832
  machine_score_avg = sum(machine_score) / len(machine_score)
 
839
  return human_score_avg, "HUMAN"
840
  else:
841
  return 0, "UNKNOWN"
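For reference, color_text() earlier in this file wraps each run of overlapping words in a green <span>. A minimal illustration of the markup it emits, with a hypothetical sentence and match range:

words = "Officials confirmed the report on Monday".split()
start, end = 1, 3  # hypothetical run of matched words
highlighted = (
    " ".join(words[:start])
    + f" <span style='color:#00FF00;'>{' '.join(words[start:end])}</span> "
    + " ".join(words[end:])
)
print(highlighted)  # Officials <span style='color:#00FF00;'>confirmed the</span> report on Monday (spacing approximate)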
 
 
src/application/content_generation.py CHANGED
@@ -1,25 +1,27 @@
1
  import json
 
 
2
  import openai
3
  from dotenv import load_dotenv
4
- import os
5
 
6
  load_dotenv()
7
- AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
8
- AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
9
- AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
10
 
11
  client = openai.AzureOpenAI(
12
- api_version = AZURE_OPENAI_API_VERSION,
13
- api_key = AZURE_OPENAI_API_KEY,
14
- azure_endpoint = AZURE_OPENAI_ENDPOINT,
15
- )
 
16
 
17
  def generate_fake_text(text_generation_model, title, content):
18
  # Generate text using the selected models
19
- prompt = """Generate a random fake news tittle in this format:
20
  ---
21
  # Title: [Fake Title]
22
- # Content:
23
  [Fake Content]
24
  ---
25
  """
@@ -32,22 +34,25 @@ def generate_fake_text(text_generation_model, title, content):
32
  elif content:
33
  prompt += """base on the following context:
34
  # Content: {news_content}"""
35
-
36
  # Generate text using the text generation model
37
- # Generate text using the selected model
38
  try:
39
  response = client.chat.completions.create(
40
- model=text_generation_model,
41
- messages = [{"role": "system", "content": prompt}],
 
 
 
 
 
42
  )
43
-
44
- print("Response from OpenAI API: ", response.choices[0].message.content)
45
  fake_text = response.choices[0].message.content
46
 
47
  except openai.OpenAIError as e:
48
  print(f"Error interacting with OpenAI API: {e}")
49
- fake_text = ""
50
-
51
  if fake_text != "":
52
  fake_title, fake_content = extract_title_content(fake_text)
53
  return fake_title, fake_content
@@ -57,12 +62,12 @@ def extract_title_content(fake_news):
57
  """
58
  Extracts the title and content from the generated fake news string.
59
 
60
- This function parses a string containing fake news, which is expected to have
61
- a specific format with a title and content section marked by '# Title:' and
62
- '# Content:' respectively.
63
 
64
  Args:
65
- fake_news (str): A string containing the generated fake news in the expected format.
66
 
67
  Returns:
68
  tuple: A tuple containing two elements:
@@ -77,33 +82,36 @@ def extract_title_content(fake_news):
77
  title_start_index = fake_news.find("# Title: ") + len("# Title: ")
78
  title_end_index = fake_news.find("\n", title_start_index)
79
  title = fake_news[title_start_index:title_end_index].strip()
80
-
81
- content_start_index = fake_news.find("\n# Content: ") + len("\n# Content: ")
 
 
82
  content = fake_news[content_start_index:].strip()
83
-
84
  return title, content
85
 
 
86
  def generate_fake_image(model, title):
87
  if len(title) > 0:
88
  IMAGE_PROMPT = f"Generate a random image about {title}"
89
  else:
90
  IMAGE_PROMPT = "Generate a random image"
91
  result = client.images.generate(
92
- model="dall-e-3", # the name of your DALL-E 3 deployment
93
  prompt=IMAGE_PROMPT,
94
- n=1
95
  )
96
- image_url = json.loads(result.model_dump_json())['data'][0]['url']
97
  return image_url
98
-
99
-
100
  def replace_text(news_title, news_content, replace_df):
101
  """
102
  Replaces occurrences in the input text based on the provided DataFrame.
103
 
104
  Args:
105
  text: The input text.
106
- replace_df: A pandas DataFrame with two columns: "find_what" and "replace_with".
107
 
108
  Returns:
109
  The text after all replacements have been made.
@@ -113,4 +121,4 @@ def replace_text(news_title, news_content, replace_df):
113
  replace_with = row["Replace with:"]
114
  news_content = news_content.replace(find_what, replace_with)
115
  news_title = news_title.replace(find_what, replace_with)
116
- return news_title, news_content
 
1
  import json
2
+ import os
3
+
4
  import openai
5
  from dotenv import load_dotenv
 
6
 
7
  load_dotenv()
8
+ AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
9
+ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
10
+ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
11
 
12
  client = openai.AzureOpenAI(
13
+ api_version=AZURE_OPENAI_API_VERSION,
14
+ api_key=AZURE_OPENAI_API_KEY,
15
+ azure_endpoint=AZURE_OPENAI_ENDPOINT,
16
+ )
17
+
18
 
19
  def generate_fake_text(text_generation_model, title, content):
20
  # Generate text using the selected models
21
+ prompt = """Generate a random fake news tittle in this format:
22
  ---
23
  # Title: [Fake Title]
24
+ # Content:
25
  [Fake Content]
26
  ---
27
  """
 
34
  elif content:
35
  prompt += """base on the following context:
36
  # Content: {news_content}"""
37
+
38
  # Generate text using the text generation model
39
+ # Generate text using the selected model
40
  try:
41
  response = client.chat.completions.create(
42
+ model=text_generation_model,
43
+ messages=[{"role": "system", "content": prompt}],
44
+ )
45
+
46
+ print(
47
+ "Response from OpenAI API: ",
48
+ response.choices[0].message.content,
49
  )
 
 
50
  fake_text = response.choices[0].message.content
51
 
52
  except openai.OpenAIError as e:
53
  print(f"Error interacting with OpenAI API: {e}")
54
+ fake_text = ""
55
+
56
  if fake_text != "":
57
  fake_title, fake_content = extract_title_content(fake_text)
58
  return fake_title, fake_content
 
62
  """
63
  Extracts the title and content from the generated fake news string.
64
 
65
+ This function parses a string containing fake news, which is expected
66
+ to have a specific format with a title and content section marked by
67
+ '# Title:' and '# Content:' respectively.
68
 
69
  Args:
70
+ fake_news (str): A string containing the generated fake news.
71
 
72
  Returns:
73
  tuple: A tuple containing two elements:
 
82
  title_start_index = fake_news.find("# Title: ") + len("# Title: ")
83
  title_end_index = fake_news.find("\n", title_start_index)
84
  title = fake_news[title_start_index:title_end_index].strip()
85
+
86
+ content_start_index = fake_news.find("\n# Content: ") + len(
87
+ "\n# Content: ",
88
+ )
89
  content = fake_news[content_start_index:].strip()
90
+
91
  return title, content
92
 
93
+
94
  def generate_fake_image(model, title):
95
  if len(title) > 0:
96
  IMAGE_PROMPT = f"Generate a random image about {title}"
97
  else:
98
  IMAGE_PROMPT = "Generate a random image"
99
  result = client.images.generate(
100
+ model="dall-e-3", # the name of your DALL-E 3 deployment
101
  prompt=IMAGE_PROMPT,
102
+ n=1,
103
  )
104
+ image_url = json.loads(result.model_dump_json())["data"][0]["url"]
105
  return image_url
106
+
107
+
108
  def replace_text(news_title, news_content, replace_df):
109
  """
110
  Replaces occurrences in the input text based on the provided DataFrame.
111
 
112
  Args:
113
  text: The input text.
114
+ replace_df: A DF with 2 columns: "find_what" & "replace_with".
115
 
116
  Returns:
117
  The text after all replacements have been made.
 
121
  replace_with = row["Replace with:"]
122
  news_content = news_content.replace(find_what, replace_with)
123
  news_title = news_title.replace(find_what, replace_with)
124
+ return news_title, news_content
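A short usage sketch of replace_text() as defined above, with a hypothetical replacement table whose column labels match the keys used in the loop ("Find what:" / "Replace with:"):

import pandas as pd

replace_df = pd.DataFrame({"Find what:": ["Monday"], "Replace with:": ["Friday"]})
title, content = replace_text(
    "Bridge opens Monday",
    "Officials said the bridge opens Monday.",
    replace_df,
)
print(title)    # Bridge opens Friday
print(content)  # Officials said the bridge opens Friday.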
src/application/image/image_comparison.py CHANGED
@@ -1,9 +1,12 @@
1
- import requests
2
  from io import BytesIO
3
- from PIL import Image
4
  import imagehash
 
 
 
5
  from src.application.image.search_yandex import YandexReverseImageSearcher
6
 
 
7
  def get_image_from_url(url):
8
  try:
9
  response = requests.get(url)
@@ -12,6 +15,7 @@ def get_image_from_url(url):
12
  print(f"Error opening image: {e}")
13
  return None
14
 
 
15
  def get_image_from_file(file_path):
16
  try:
17
  return Image.open(file_path)
@@ -19,33 +23,36 @@ def get_image_from_file(file_path):
19
  print(f"Error occurred while opening image from file: {file_path}")
20
  return None
21
 
 
22
  def standardize_image(image):
23
  # Convert to RGB if needed
24
- if image.mode in ('RGBA', 'LA'):
25
- background = Image.new('RGB', image.size, (255, 255, 255))
26
  background.paste(image, mask=image.split()[-1])
27
  image = background
28
- elif image.mode != 'RGB':
29
- image = image.convert('RGB')
30
-
31
  # Resize to standard size (e.g. 256x256)
32
  standard_size = (256, 256)
33
  image = image.resize(standard_size)
34
-
35
  return image
36
 
 
37
  def compare_images(image1, image2):
38
  # Standardize both images first
39
  img1_std = standardize_image(image1)
40
  img2_std = standardize_image(image2)
41
-
42
  hash1 = imagehash.average_hash(img1_std)
43
  hash2 = imagehash.average_hash(img2_std)
44
  return hash1 - hash2 # Returns the Hamming distance between the hashes
45
 
46
- if __name__ == '__main__':
47
- image_url = 'https://i.pinimg.com/originals/c4/50/35/c450352ac6ea8645ead206721673e8fb.png'
48
-
 
49
  # Get the image from URL
50
  url_image = get_image_from_url(image_url)
51
 
@@ -54,13 +61,13 @@ if __name__ == '__main__':
54
  res = rev_img_searcher.search(image_url)
55
 
56
  for search_item in res:
57
- print(f'Title: {search_item.page_title}')
58
  # print(f'Site: {search_item.page_url}')
59
- print(f'Img: {search_item.image_url}\n')
60
 
61
  # Compare each search result image with the input image
62
  result_image = get_image_from_url(search_item.image_url)
63
  result_difference = compare_images(result_image, url_image)
64
  print(f"Difference with search result: {result_difference}")
65
- if result_difference == 0:
66
- break
 
 
1
  from io import BytesIO
2
+
3
  import imagehash
4
+ import requests
5
+ from PIL import Image
6
+
7
  from src.application.image.search_yandex import YandexReverseImageSearcher
8
 
9
+
10
  def get_image_from_url(url):
11
  try:
12
  response = requests.get(url)
 
15
  print(f"Error opening image: {e}")
16
  return None
17
 
18
+
19
  def get_image_from_file(file_path):
20
  try:
21
  return Image.open(file_path)
 
23
  print(f"Error occurred while opening image from file: {file_path}")
24
  return None
25
 
26
+
27
  def standardize_image(image):
28
  # Convert to RGB if needed
29
+ if image.mode in ("RGBA", "LA"):
30
+ background = Image.new("RGB", image.size, (255, 255, 255))
31
  background.paste(image, mask=image.split()[-1])
32
  image = background
33
+ elif image.mode != "RGB":
34
+ image = image.convert("RGB")
35
+
36
  # Resize to standard size (e.g. 256x256)
37
  standard_size = (256, 256)
38
  image = image.resize(standard_size)
39
+
40
  return image
41
 
42
+
43
  def compare_images(image1, image2):
44
  # Standardize both images first
45
  img1_std = standardize_image(image1)
46
  img2_std = standardize_image(image2)
47
+
48
  hash1 = imagehash.average_hash(img1_std)
49
  hash2 = imagehash.average_hash(img2_std)
50
  return hash1 - hash2 # Returns the Hamming distance between the hashes
51
 
52
+
53
+ if __name__ == "__main__":
54
+ image_url = "https://i.pinimg.com/originals/c4/50/35/c450352ac6ea8645ead206721673e8fb.png" # noqa: E501
55
+
56
  # Get the image from URL
57
  url_image = get_image_from_url(image_url)
58
 
 
61
  res = rev_img_searcher.search(image_url)
62
 
63
  for search_item in res:
64
+ print(f"Title: {search_item.page_title}")
65
  # print(f'Site: {search_item.page_url}')
66
+ print(f"Img: {search_item.image_url}\n")
67
 
68
  # Compare each search result image with the input image
69
  result_image = get_image_from_url(search_item.image_url)
70
  result_difference = compare_images(result_image, url_image)
71
  print(f"Difference with search result: {result_difference}")
72
+ if result_difference == 0:
73
+ break
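A minimal sketch of the perceptual-hash comparison used above, assuming two locally available images (the file names are hypothetical); compare_images() does the same after standardize_image() has converted both to 256x256 RGB:

import imagehash
from PIL import Image

img1 = Image.open("news_photo.jpg").convert("RGB").resize((256, 256))
img2 = Image.open("candidate_from_web.jpg").convert("RGB").resize((256, 256))
distance = imagehash.average_hash(img1) - imagehash.average_hash(img2)  # Hamming distance
print(distance)  # 0 means the hashes are identical, as in the loop above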
src/application/image/image_detection.py CHANGED
@@ -1,14 +1,19 @@
1
-
2
- from src.application.image.image_comparison import compare_images, get_image_from_file, get_image_from_url
 
 
 
3
  from src.application.image.model_detection import image_generation_detection
4
  from src.application.image.search_yandex import yandex_reverse_image_search
5
 
6
 
7
  def compare_list_of_images(news_image_path, img_urls):
8
- news_image = get_image_from_file(news_image_path) # TODO: news_image_path is arrays
 
 
9
  if news_image is None:
10
  return None, -1
11
-
12
  matched_url = ""
13
  max_similarity = 0
14
  for url in img_urls:
@@ -20,7 +25,10 @@ def compare_list_of_images(news_image_path, img_urls):
20
  referred_image = get_image_from_url(url)
21
  if referred_image is None:
22
  continue
23
- distance = compare_images(news_image, referred_image) # Hamming algorithm
 
 
 
24
  similarity = max(100 - distance, 0)
25
  if similarity > max_similarity:
26
  max_similarity = similarity
@@ -29,14 +37,17 @@ def compare_list_of_images(news_image_path, img_urls):
29
  if max_similarity > 90:
30
  return matched_url, max_similarity
31
  return None, -1
32
-
33
-
34
  def detect_image_from_news_image(news_image_path, image_urls):
35
  print("\tFrom news:")
36
  return compare_list_of_images(news_image_path, image_urls)
37
 
 
38
  def detect_image_by_reverse_search(news_image_path):
39
- image_urls = yandex_reverse_image_search(news_image_path) # url or file_path
 
 
40
  print("\tFrom search engine:")
41
  for url in image_urls:
42
  print(f"\t\t{url}")
@@ -47,5 +58,5 @@ def detect_image_by_ai_model(news_image_path):
47
  print("\tFrom AI model:")
48
  image_prediction_label, image_confidence = image_generation_detection(
49
  news_image_path,
50
- )
51
- return image_prediction_label, image_confidence
 
1
+ from src.application.image.image_comparison import (
2
+ compare_images,
3
+ get_image_from_file,
4
+ get_image_from_url,
5
+ )
6
  from src.application.image.model_detection import image_generation_detection
7
  from src.application.image.search_yandex import yandex_reverse_image_search
8
 
9
 
10
  def compare_list_of_images(news_image_path, img_urls):
11
+ news_image = get_image_from_file(
12
+ news_image_path,
13
+ ) # TODO: news_image_path is arrays
14
  if news_image is None:
15
  return None, -1
16
+
17
  matched_url = ""
18
  max_similarity = 0
19
  for url in img_urls:
 
25
  referred_image = get_image_from_url(url)
26
  if referred_image is None:
27
  continue
28
+ distance = compare_images(
29
+ news_image,
30
+ referred_image,
31
+ ) # Hamming algorithm
32
  similarity = max(100 - distance, 0)
33
  if similarity > max_similarity:
34
  max_similarity = similarity
 
37
  if max_similarity > 90:
38
  return matched_url, max_similarity
39
  return None, -1
40
+
41
+
42
  def detect_image_from_news_image(news_image_path, image_urls):
43
  print("\tFrom news:")
44
  return compare_list_of_images(news_image_path, image_urls)
45
 
46
+
47
  def detect_image_by_reverse_search(news_image_path):
48
+ image_urls = yandex_reverse_image_search(
49
+ news_image_path,
50
+ ) # url or file_path
51
  print("\tFrom search engine:")
52
  for url in image_urls:
53
  print(f"\t\t{url}")
 
58
  print("\tFrom AI model:")
59
  image_prediction_label, image_confidence = image_generation_detection(
60
  news_image_path,
61
+ )
62
+ return image_prediction_label, image_confidence
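Taken together, these helpers form the cascade that determine_image_origin() in content_detection.py walks through. A condensed, hedged sketch of that flow (news_image and found_img_urls are placeholder inputs):

def classify_image(news_image, found_img_urls):
    # 1) match against images already found in the source articles
    matched_url, similarity = detect_image_from_news_image(news_image, found_img_urls)
    if matched_url is not None:
        return "HUMAN", similarity, matched_url
    # 2) fall back to a reverse image search
    matched_url, similarity = detect_image_by_reverse_search(news_image)
    if matched_url is not None:
        return "HUMAN", similarity, matched_url
    # 3) finally ask the AI-image classifier
    label, score = detect_image_by_ai_model(news_image)
    if label:
        return label, score, None
    return "UNKNOWN", 50, None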
src/application/image/model_detection.py CHANGED
@@ -1,23 +1,39 @@
1
- from sklearn.metrics import roc_auc_score
2
- from torchmetrics import Accuracy, Recall
3
  import pytorch_lightning as pl
4
- import timm
5
  import torch
6
  import torch.nn.functional as F
7
- import logging
8
- from PIL import Image
9
  import torchvision.transforms as transforms
 
 
 
 
 
 
10
  from torchvision.transforms import v2
11
 
12
- logging.basicConfig(filename='training.log',filemode='w',level=logging.INFO, force=True)
13
- CHECKPOINT = "models/image_classifier/image-classifier-step=8008-val_loss=0.11.ckpt"
 
 
 
 
 
 
 
 
14
 
15
  class ImageClassifier(pl.LightningModule):
16
  def __init__(self, lmd=0):
17
  super().__init__()
18
- self.model = timm.create_model('resnet50', pretrained=True, num_classes=1)
19
- self.accuracy = Accuracy(task='binary', threshold=0.5)
20
- self.recall = Recall(task='binary', threshold=0.5)
 
 
 
 
21
  self.validation_outputs = []
22
  self.lmd = lmd
23
 
@@ -27,13 +43,13 @@ class ImageClassifier(pl.LightningModule):
27
  def training_step(self, batch):
28
  images, labels, _ = batch
29
  outputs = self.forward(images).squeeze()
30
-
31
  print(f"Shape of outputs (training): {outputs.shape}")
32
  print(f"Shape of labels (training): {labels.shape}")
33
-
34
  loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
35
  logging.info(f"Training Step - ERM loss: {loss.item()}")
36
- loss += self.lmd * (outputs ** 2).mean() # SD loss penalty
37
  logging.info(f"Training Step - SD loss: {loss.item()}")
38
  return loss
39
 
@@ -43,20 +59,30 @@ class ImageClassifier(pl.LightningModule):
43
 
44
  if outputs.shape == torch.Size([]):
45
  return
46
-
47
  print(f"Shape of outputs (validation): {outputs.shape}")
48
  print(f"Shape of labels (validation): {labels.shape}")
49
 
50
  loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
51
  preds = torch.sigmoid(outputs)
52
- self.log('val_loss', loss, prog_bar=True, sync_dist=True)
53
- self.log('val_acc', self.accuracy(preds, labels.int()), prog_bar=True, sync_dist=True)
54
- self.log('val_recall', self.recall(preds, labels.int()), prog_bar=True, sync_dist=True)
 
 
 
 
 
 
 
 
 
 
55
  output = {"val_loss": loss, "preds": preds, "labels": labels}
56
  self.validation_outputs.append(output)
57
  logging.info(f"Validation Step - Batch loss: {loss.item()}")
58
  return output
59
-
60
  def predict_step(self, batch):
61
  images, label, domain = batch
62
  outputs = self.forward(images).squeeze()
@@ -67,13 +93,13 @@ class ImageClassifier(pl.LightningModule):
67
  if not self.validation_outputs:
68
  logging.warning("No outputs in validation step to process")
69
  return
70
- preds = torch.cat([x['preds'] for x in self.validation_outputs])
71
- labels = torch.cat([x['labels'] for x in self.validation_outputs])
72
  if labels.unique().size(0) == 1:
73
  logging.warning("Only one class in validation step")
74
  return
75
  auc_score = roc_auc_score(labels.cpu(), preds.cpu())
76
- self.log('val_auc', auc_score, prog_bar=True, sync_dist=True)
77
  logging.info(f"Validation Epoch End - AUC score: {auc_score}")
78
  self.validation_outputs = []
79
 
@@ -82,45 +108,46 @@ class ImageClassifier(pl.LightningModule):
82
  return optimizer
83
 
84
 
85
-
86
  def load_image(image_path, transform=None):
87
- image = Image.open(image_path).convert('RGB')
88
-
89
  if transform:
90
  image = transform(image)
91
-
92
  return image
93
 
94
 
95
  def predict_single_image(image_path, model, transform=None):
96
- image = load_image(image_path, transform)
97
-
98
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
99
-
100
  model.to(device)
101
-
102
  image = image.to(device)
103
 
104
  model.eval()
105
-
106
  with torch.no_grad():
107
- image = image.unsqueeze(0)
108
- output = model(image).squeeze()
109
- prediction = torch.sigmoid(output).item()
110
-
111
  return prediction
112
 
113
 
114
  def image_generation_detection(image_path):
115
  model = ImageClassifier.load_from_checkpoint(CHECKPOINT)
116
 
117
- transform = v2.Compose([
118
- transforms.ToTensor(),
119
- v2.CenterCrop((256, 256)),
120
- ])
 
 
 
 
121
 
122
- prediction = predict_single_image(image_path, model, transform)
123
-
124
  result = ""
125
  if prediction <= 0.2:
126
  result += "Most likely human"
@@ -134,8 +161,8 @@ def image_generation_detection(image_path):
134
  return image_prediction_label, image_confidence
135
 
136
 
137
- if __name__ == "__main__":
138
  image_path = "path_to_your_image.jpg" # Replace with your image path
139
  image_prediction_label, image_confidence = image_generation_detection(
140
  image_path,
141
- )
 
1
+ import logging
2
+
3
  import pytorch_lightning as pl
4
+ import timm
5
  import torch
6
  import torch.nn.functional as F
 
 
7
  import torchvision.transforms as transforms
8
+ from PIL import Image
9
+ from sklearn.metrics import roc_auc_score
10
+ from torchmetrics import (
11
+ Accuracy,
12
+ Recall,
13
+ )
14
  from torchvision.transforms import v2
15
 
16
+ logging.basicConfig(
17
+ filename="training.log",
18
+ filemode="w",
19
+ level=logging.INFO,
20
+ force=True,
21
+ )
22
+ CHECKPOINT = (
23
+ "models/image_classifier/image-classifier-step=8008-val_loss=0.11.ckpt"
24
+ )
25
+
26
 
27
  class ImageClassifier(pl.LightningModule):
28
  def __init__(self, lmd=0):
29
  super().__init__()
30
+ self.model = timm.create_model(
31
+ "resnet50",
32
+ pretrained=True,
33
+ num_classes=1,
34
+ )
35
+ self.accuracy = Accuracy(task="binary", threshold=0.5)
36
+ self.recall = Recall(task="binary", threshold=0.5)
37
  self.validation_outputs = []
38
  self.lmd = lmd
39
 
 
43
  def training_step(self, batch):
44
  images, labels, _ = batch
45
  outputs = self.forward(images).squeeze()
46
+
47
  print(f"Shape of outputs (training): {outputs.shape}")
48
  print(f"Shape of labels (training): {labels.shape}")
49
+
50
  loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
51
  logging.info(f"Training Step - ERM loss: {loss.item()}")
52
+ loss += self.lmd * (outputs**2).mean() # SD loss penalty
53
  logging.info(f"Training Step - SD loss: {loss.item()}")
54
  return loss
55
 
 
59
 
60
  if outputs.shape == torch.Size([]):
61
  return
62
+
63
  print(f"Shape of outputs (validation): {outputs.shape}")
64
  print(f"Shape of labels (validation): {labels.shape}")
65
 
66
  loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
67
  preds = torch.sigmoid(outputs)
68
+ self.log("val_loss", loss, prog_bar=True, sync_dist=True)
69
+ self.log(
70
+ "val_acc",
71
+ self.accuracy(preds, labels.int()),
72
+ prog_bar=True,
73
+ sync_dist=True,
74
+ )
75
+ self.log(
76
+ "val_recall",
77
+ self.recall(preds, labels.int()),
78
+ prog_bar=True,
79
+ sync_dist=True,
80
+ )
81
  output = {"val_loss": loss, "preds": preds, "labels": labels}
82
  self.validation_outputs.append(output)
83
  logging.info(f"Validation Step - Batch loss: {loss.item()}")
84
  return output
85
+
86
  def predict_step(self, batch):
87
  images, label, domain = batch
88
  outputs = self.forward(images).squeeze()
 
93
  if not self.validation_outputs:
94
  logging.warning("No outputs in validation step to process")
95
  return
96
+ preds = torch.cat([x["preds"] for x in self.validation_outputs])
97
+ labels = torch.cat([x["labels"] for x in self.validation_outputs])
98
  if labels.unique().size(0) == 1:
99
  logging.warning("Only one class in validation step")
100
  return
101
  auc_score = roc_auc_score(labels.cpu(), preds.cpu())
102
+ self.log("val_auc", auc_score, prog_bar=True, sync_dist=True)
103
  logging.info(f"Validation Epoch End - AUC score: {auc_score}")
104
  self.validation_outputs = []
105
 
 
108
  return optimizer
109
 
110
 
 
111
  def load_image(image_path, transform=None):
112
+ image = Image.open(image_path).convert("RGB")
113
+
114
  if transform:
115
  image = transform(image)
116
+
117
  return image
118
 
119
 
120
  def predict_single_image(image_path, model, transform=None):
121
+ image = load_image(image_path, transform)
122
+
123
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
124
+
125
  model.to(device)
126
+
127
  image = image.to(device)
128
 
129
  model.eval()
130
+
131
  with torch.no_grad():
132
+ image = image.unsqueeze(0)
133
+ output = model(image).squeeze()
134
+ prediction = torch.sigmoid(output).item()
135
+
136
  return prediction
137
 
138
 
139
  def image_generation_detection(image_path):
140
  model = ImageClassifier.load_from_checkpoint(CHECKPOINT)
141
 
142
+ transform = v2.Compose(
143
+ [
144
+ transforms.ToTensor(),
145
+ v2.CenterCrop((256, 256)),
146
+ ],
147
+ )
148
+
149
+ prediction = predict_single_image(image_path, model, transform)
150
 
 
 
151
  result = ""
152
  if prediction <= 0.2:
153
  result += "Most likely human"
 
161
  return image_prediction_label, image_confidence
162
 
163
 
164
+ if __name__ == "__main__":
165
  image_path = "path_to_your_image.jpg" # Replace with your image path
166
  image_prediction_label, image_confidence = image_generation_detection(
167
  image_path,
168
+ )
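Note on the loss above: training_step adds an SD (spectral-decoupling-style) penalty, lmd * mean(logits**2), on top of the BCE-with-logits ERM loss. A minimal sketch of that computation on dummy tensors follows; the lmd value here is illustrative (the class default is 0).

# Sketch of the penalised training loss from training_step, on dummy data.
import torch
import torch.nn.functional as F

logits = torch.tensor([1.2, -0.7, 0.3, 2.5])   # stand-in for self.forward(images).squeeze()
labels = torch.tensor([1.0, 0.0, 0.0, 1.0])    # dummy binary labels
lmd = 0.1                                      # illustrative; ImageClassifier defaults to lmd=0

erm_loss = F.binary_cross_entropy_with_logits(logits, labels)
sd_penalty = lmd * (logits**2).mean()          # penalises large, over-confident logits
total_loss = erm_loss + sd_penalty
print(float(erm_loss), float(sd_penalty), float(total_loss))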
src/application/image/search_yandex.py CHANGED
@@ -1,17 +1,22 @@
1
- import time
2
  import logging
 
3
  import requests
4
- import json
5
  from bs4 import BeautifulSoup
6
- from urllib.parse import quote, urlparse
7
 
8
  logging.basicConfig(
9
- filename='error.log',
10
  level=logging.INFO,
11
- format='%(asctime)s | [%(levelname)s]: %(message)s',
12
- datefmt='%m-%d-%Y / %I:%M:%S %p'
13
  )
14
 
 
15
  class SearchResults:
16
  def __init__(self, results):
17
  self.results = results
@@ -25,20 +30,29 @@ class SearchResults:
25
  output += "---\n"
26
  return output
27
 
 
28
  class YandexReverseImageSearcher:
29
  def __init__(self):
30
  self.base_url = "https://yandex.ru/images/search"
31
- self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
 
 
32
  self.retry_count = 3
33
  self.retry_delay = 1
34
 
35
- def response(self, query: str, image_url: str, max_results: int = 10, delay: int = 1) -> SearchResults:
 
36
  self._validate_input(query, image_url)
37
-
38
  encoded_query = quote(query)
39
  encoded_image_url = quote(image_url)
40
 
41
- url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2"
42
 
43
  all_results = []
44
  start_index = 0
@@ -46,14 +60,16 @@ class YandexReverseImageSearcher:
46
  while len(all_results) < max_results:
47
  if start_index != 0:
48
  time.sleep(delay)
49
-
50
  paginated_url = f"{url}&start={start_index}"
51
 
52
  response = self._make_request(paginated_url)
53
  if response is None:
54
  break
55
 
56
- search_results, valid_content = self._parse_search_results(response.text)
 
 
57
  if not valid_content:
58
  logging.warning("Unexpected HTML structure encountered.")
59
  break
@@ -65,34 +81,44 @@ class YandexReverseImageSearcher:
65
  if data and data not in all_results:
66
  all_results.append(data)
67
 
68
- start_index += (len(all_results)-start_index)
69
 
70
  if len(all_results) == 0:
71
- logging.warning(f"No results were found for the given query: [{query}], and/or image URL: [{image_url}].")
72
- return "No results found. Please try again with a different query and/or image URL."
 
 
73
  else:
74
  return SearchResults(all_results[:max_results])
75
-
76
  def _validate_input(self, query: str, image_url: str):
77
  if not query:
78
- raise ValueError("Query not found. Please enter a query and try again.")
 
 
79
  if not image_url:
80
- raise ValueError("Image URL not found. Please enter an image URL and try again.")
 
 
81
  if not self._validate_image_url(image_url):
82
- raise ValueError("Invalid image URL. Please enter a valid image URL and try again.")
83
-
 
 
84
  def _validate_image_url(self, url: str) -> bool:
85
  parsed_url = urlparse(url)
86
  path = parsed_url.path.lower()
87
  valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
88
  return any(path.endswith(ext) for ext in valid_extensions)
89
-
90
  def _make_request(self, url: str):
91
  attempts = 0
92
  while attempts < self.retry_count:
93
  try:
94
  response = requests.get(url, headers=self.headers)
95
- if response.headers.get('Content-Type', '').startswith('text/html'):
 
 
96
  response.raise_for_status()
97
  return response
98
  else:
@@ -110,14 +136,22 @@ class YandexReverseImageSearcher:
110
  def _parse_search_results(self, html_content: str):
111
  try:
112
  soup = BeautifulSoup(html_content, "html.parser")
113
- return soup.find_all('div', class_='g'), True
114
  except Exception as e:
115
  logging.error(f"Error parsing HTML content: {e}")
116
  return None, False
117
 
118
  def _extract_result_data(self, result):
119
- link = result.find('a', href=True)['href'] if result.find('a', href=True) else None
120
- title = result.find('h3').get_text(strip=True) if result.find('h3') else None
 
 
 
 
 
 
 
 
121
  return {"link": link, "title": title} if link and title else {}
122
 
123
 
@@ -131,24 +165,27 @@ def get_image_links(page):
131
  Returns:
132
  A list of image URLs.
133
  """
134
- soup = BeautifulSoup(page, 'html.parser')
135
-
136
  # Find the specific section containing image links
137
- gallery_data = soup.find('div', {'class': 'cbir-section cbir-section_name_sites'})
 
 
 
138
  if gallery_data is None:
139
  return []
140
-
141
  # Find the container of image links
142
- image_links_container = gallery_data.find('div', {'class': 'Root'})
143
  if image_links_container is None:
144
  return []
145
-
146
- data_state = json.loads(image_links_container['data-state'])
147
 
148
  # Extract URLs from each div
149
  image_urls = []
150
- for site in data_state['sites']:
151
- original_image_url = site['originalImage']['url']
152
  image_urls.append(original_image_url)
153
 
154
  return image_urls
@@ -158,19 +195,19 @@ def yandex_reverse_image_search(file_path):
158
  img_search_url = generate_images_search_links(file_path)
159
  if img_search_url is None:
160
  return []
161
-
162
  # Simulate a user agent to avoid being blocked
163
  headers = {
164
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
165
- 'Content-Type': 'application/json',
166
  }
167
-
168
  try:
169
  response = requests.get(img_search_url, headers=headers)
170
  response.raise_for_status() # Raise an exception for bad status codes
171
 
172
  # Parse the HTML content
173
- soup = BeautifulSoup(response.content, 'html.parser')
174
  image_urls = get_image_links(soup.prettify())
175
  return image_urls
176
 
@@ -180,21 +217,28 @@ def yandex_reverse_image_search(file_path):
180
 
181
 
182
  def generate_images_search_links(file_path):
183
- search_url = 'https://yandex.ru/images/search'
184
- params = {'rpt': 'imageview', 'format': 'json', 'request': '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}'}
185
-
 
 
 
 
186
  try:
187
- files = {'upfile': ('blob', open(file_path, 'rb'), 'image/jpeg/webp')}
188
  response = requests.post(search_url, params=params, files=files)
189
- query_string = json.loads(response.content)['blocks'][0]['params']['url']
190
- img_search_url = search_url + '?' + query_string
 
 
191
  return img_search_url
192
- except:
 
193
  return None
194
 
195
 
196
  if __name__ == "__main__":
197
- file_path = "T:\\Projects\\prj-nict-ai-content-detection\\data\\test_data\\towels.jpg.webp"
198
  image_urls = yandex_reverse_image_search(file_path)
199
  for image_url in image_urls:
200
  print(f"Image URL: {image_url}")
 
1
+ import json
2
  import logging
3
+ import time
4
+ from urllib.parse import (
5
+ quote,
6
+ urlparse,
7
+ )
8
+
9
  import requests
 
10
  from bs4 import BeautifulSoup
 
11
 
12
  logging.basicConfig(
13
+ filename="error.log",
14
  level=logging.INFO,
15
+ format="%(asctime)s | [%(levelname)s]: %(message)s",
16
+ datefmt="%m-%d-%Y / %I:%M:%S %p",
17
  )
18
 
19
+
20
  class SearchResults:
21
  def __init__(self, results):
22
  self.results = results
 
30
  output += "---\n"
31
  return output
32
 
33
+
34
  class YandexReverseImageSearcher:
35
  def __init__(self):
36
  self.base_url = "https://yandex.ru/images/search"
37
+ self.headers = {
38
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", # noqa: E501
39
+ }
40
  self.retry_count = 3
41
  self.retry_delay = 1
42
 
43
+ def response(
44
+ self,
45
+ query: str,
46
+ image_url: str,
47
+ max_results: int = 10,
48
+ delay: int = 1,
49
+ ) -> SearchResults:
50
  self._validate_input(query, image_url)
51
+
52
  encoded_query = quote(query)
53
  encoded_image_url = quote(image_url)
54
 
55
+ url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2" # noqa: E501
56
 
57
  all_results = []
58
  start_index = 0
 
60
  while len(all_results) < max_results:
61
  if start_index != 0:
62
  time.sleep(delay)
63
+
64
  paginated_url = f"{url}&start={start_index}"
65
 
66
  response = self._make_request(paginated_url)
67
  if response is None:
68
  break
69
 
70
+ search_results, valid_content = self._parse_search_results(
71
+ response.text,
72
+ )
73
  if not valid_content:
74
  logging.warning("Unexpected HTML structure encountered.")
75
  break
 
81
  if data and data not in all_results:
82
  all_results.append(data)
83
 
84
+ start_index += len(all_results) - start_index
85
 
86
  if len(all_results) == 0:
87
+ logging.warning(
88
+ f"No results were found for the given query: [{query}], and/or image URL: [{image_url}].", # noqa: E501
89
+ )
90
+ return "No results found. Please try again with a different query and/or image URL." # noqa: E501
91
  else:
92
  return SearchResults(all_results[:max_results])
93
+
94
  def _validate_input(self, query: str, image_url: str):
95
  if not query:
96
+ raise ValueError(
97
+ "Query not found. Enter a query and try again.",
98
+ )
99
  if not image_url:
100
+ raise ValueError(
101
+ "Image URL not found. Enter an image URL and try again.",
102
+ )
103
  if not self._validate_image_url(image_url):
104
+ raise ValueError(
105
+ "Invalid image URL. Enter a valid image URL and try again.",
106
+ )
107
+
108
  def _validate_image_url(self, url: str) -> bool:
109
  parsed_url = urlparse(url)
110
  path = parsed_url.path.lower()
111
  valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
112
  return any(path.endswith(ext) for ext in valid_extensions)
113
+
114
  def _make_request(self, url: str):
115
  attempts = 0
116
  while attempts < self.retry_count:
117
  try:
118
  response = requests.get(url, headers=self.headers)
119
+ if response.headers.get("Content-Type", "").startswith(
120
+ "text/html",
121
+ ):
122
  response.raise_for_status()
123
  return response
124
  else:
 
136
  def _parse_search_results(self, html_content: str):
137
  try:
138
  soup = BeautifulSoup(html_content, "html.parser")
139
+ return soup.find_all("div", class_="g"), True
140
  except Exception as e:
141
  logging.error(f"Error parsing HTML content: {e}")
142
  return None, False
143
 
144
  def _extract_result_data(self, result):
145
+ link = (
146
+ result.find("a", href=True)["href"]
147
+ if result.find("a", href=True)
148
+ else None
149
+ )
150
+ title = (
151
+ result.find("h3").get_text(strip=True)
152
+ if result.find("h3")
153
+ else None
154
+ )
155
  return {"link": link, "title": title} if link and title else {}
156
 
157
 
 
165
  Returns:
166
  A list of image URLs.
167
  """
168
+ soup = BeautifulSoup(page, "html.parser")
169
+
170
  # Find the specific section containing image links
171
+ gallery_data = soup.find(
172
+ "div",
173
+ {"class": "cbir-section cbir-section_name_sites"},
174
+ )
175
  if gallery_data is None:
176
  return []
177
+
178
  # Find the container of image links
179
+ image_links_container = gallery_data.find("div", {"class": "Root"})
180
  if image_links_container is None:
181
  return []
182
+
183
+ data_state = json.loads(image_links_container["data-state"])
184
 
185
  # Extract URLs from each div
186
  image_urls = []
187
+ for site in data_state["sites"]:
188
+ original_image_url = site["originalImage"]["url"]
189
  image_urls.append(original_image_url)
190
 
191
  return image_urls
 
195
  img_search_url = generate_images_search_links(file_path)
196
  if img_search_url is None:
197
  return []
198
+
199
  # Simulate a user agent to avoid being blocked
200
  headers = {
201
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", # noqa: E501
202
+ "Content-Type": "application/json",
203
  }
204
+
205
  try:
206
  response = requests.get(img_search_url, headers=headers)
207
  response.raise_for_status() # Raise an exception for bad status codes
208
 
209
  # Parse the HTML content
210
+ soup = BeautifulSoup(response.content, "html.parser")
211
  image_urls = get_image_links(soup.prettify())
212
  return image_urls
213
 
 
217
 
218
 
219
  def generate_images_search_links(file_path):
220
+ search_url = "https://yandex.ru/images/search"
221
+ params = {
222
+ "rpt": "imageview",
223
+ "format": "json",
224
+ "request": '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}', # noqa: E501
225
+ }
226
+
227
  try:
228
+ files = {"upfile": ("blob", open(file_path, "rb"), "image/jpeg/webp")}
229
  response = requests.post(search_url, params=params, files=files)
230
+ query_string = json.loads(response.content)["blocks"][0]["params"][
231
+ "url"
232
+ ]
233
+ img_search_url = search_url + "?" + query_string
234
  return img_search_url
235
+ except requests.exceptions.RequestException as e:
236
+ print(f"Error generating search URL: {e}")
237
  return None
238
 
239
 
240
  if __name__ == "__main__":
241
+ file_path = "T:\\Projects\\prj-nict-ai-content-detection\\data\\test_data\\towels.jpg.webp" # noqa: E501
242
  image_urls = yandex_reverse_image_search(file_path)
243
  for image_url in image_urls:
244
  print(f"Image URL: {image_url}")
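For reference, a small sketch of how the paginated search URL used by YandexReverseImageSearcher.response is assembled with URL-encoded parameters; the query and image URL below are hypothetical placeholders.

# Illustrative only: mirrors the URL construction in response() above.
from urllib.parse import quote

base_url = "https://yandex.ru/images/search"
query = "city skyline at night"               # hypothetical query
image_url = "https://example.com/photo.jpg"   # hypothetical image URL

url = f"{base_url}?q={quote(query)}&image_url={quote(image_url)}&sbisrc=cr_1_5_2"
paginated_url = f"{url}&start=0"              # start index advances as results accumulate
print(paginated_url)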
src/application/text/entity.py CHANGED
@@ -1,42 +1,51 @@
1
  import colorsys
2
  import json
 
3
  import re
 
 
4
  import openai
5
  from dotenv import load_dotenv
6
- import os
7
  from transformers import pipeline
8
- import gradio as gr
9
 
10
  ner_pipeline = pipeline("ner")
11
 
12
  load_dotenv()
13
- AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
14
- AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
15
- AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
16
 
17
  client = openai.AzureOpenAI(
18
- api_version = "2024-05-01-preview", # AZURE_OPENAI_API_VERSION,
19
- api_key = AZURE_OPENAI_API_KEY,
20
- azure_endpoint = AZURE_OPENAI_ENDPOINT,
21
- )
22
 
23
 
24
- def extract_entities_gpt(original_text, compared_text, text_generation_model="o1-mini"):
 
 
 
 
25
  # "gpt-4o-mini" or "o1-mini"
26
  # Generate text using the selected models
27
  prompt = f"""
28
- Compare the ORIGINAL TEXT and the COMPARED TEXT.
29
- Identify and extract pairs of corresponding entities where the paraphrasing has resulted in a *significant* change in meaning.
30
- Focus *only* on entities where the paraphrasing has resulted in a *significant* change in meaning. This includes, but is not limited to:
31
- * **Numerical changes:** e.g., "five" changed to "ten," "10%" changed to "50%"
32
- * **Time changes:** e.g., "Monday" changed to "Sunday," "10th" changed to "21st"
33
- * **Name changes:** e.g., "Tokyo" changed to "New York," "Japan" changed to "Japanese"
34
- * **Opposite meanings:** e.g., "increase" changed to "decrease," "good" changed to "bad"
35
- * **Semantically different words:** e.g., "car" changed to "truck," "walk" changed to "run"
36
-
37
- Exclude entities where the meaning remains essentially the same, even if the wording is different (e.g., "big" changed to "large," "house" changed to "residence"). Also exclude purely stylistic changes that don't affect the core meaning.
38
-
39
- Output the extracted entity pairs, one pair per line, in the following JSON-like list format without wrapping characters:
 
 
 
 
40
  [
41
  ["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
42
  ["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
@@ -50,23 +59,24 @@ If there are no entities that satisfy above condition, output empty list "[]".
50
  # COMPARED TEXT:
51
  {compared_text}
52
  """
53
-
54
  # Generate text using the text generation model
55
- # Generate text using the selected model
56
  try:
57
  response = client.chat.completions.create(
58
- model=text_generation_model,
59
- messages = [{"role": "user", "content": prompt}],
60
  )
61
-
62
  res = response.choices[0].message.content
63
 
64
  except openai.OpenAIError as e:
65
  print(f"Error interacting with OpenAI API: {e}")
66
- res = ""
67
 
68
  return res
69
-
 
70
  def read_json(json_string) -> list[list[str]]:
71
  try:
72
  entities = json.loads(json_string)
@@ -75,53 +85,64 @@ def read_json(json_string) -> list[list[str]]:
75
  for inner_list in entities:
76
  if inner_list not in unique_entities:
77
  unique_entities.append(inner_list)
78
-
79
  return unique_entities
80
 
81
  except json.JSONDecodeError as e:
82
  print(f"Error decoding JSON: {e}")
83
  return []
84
 
 
85
  def lighten_color(hex_color, factor=1.8):
86
  """Lightens a HEX color by increasing its brightness in HSV space."""
87
 
88
  hex_color = hex_color.lstrip("#")
89
- r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
90
-
 
 
 
 
91
  # Convert to HSV
92
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
93
  v = min(1.0, v * factor) # Increase brightness
94
-
95
  # Convert back to HEX
96
- r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
97
- return f'#{r:02x}{g:02x}{b:02x}'
 
98
 
99
  def darken_color(hex_color, factor=0.7):
100
  """Darkens a hex color by reducing its brightness in the HSV space."""
101
 
102
  hex_color = hex_color.lstrip("#")
103
- r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
104
-
 
 
 
 
105
  # Convert to HSV to adjust brightness
106
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
107
  v = max(0, v * factor) # Reduce brightness
108
-
109
  # Convert back to HEX
110
- r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
111
- return f'#{r:02x}{g:02x}{b:02x}'
 
112
 
113
  def generate_color(index, total_colors=20):
114
  """Generates a unique, evenly spaced color for each index using HSL."""
115
 
116
  hue = index / total_colors # Spread hues in range [0,1]
117
  saturation = 0.65 # Keep colors vivid
118
- lightness = 0.75 # Balanced brightness
119
-
120
  # Convert HSL to RGB
121
  r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
122
  r, g, b = int(r * 255), int(g * 255), int(b * 255)
123
-
124
- return f'#{r:02x}{g:02x}{b:02x}' # Convert to hex
125
 
126
 
127
  def assign_colors_to_entities(entities):
@@ -130,12 +151,15 @@ def assign_colors_to_entities(entities):
130
  entities_colors = []
131
  for index, entity in enumerate(entities):
132
  color = generate_color(index, total_colors)
133
-
134
  # append color and index to entities_colors
135
- entities_colors.append({"color": color, "input": entity[0], "source": entity[1]})
136
-
 
 
137
  return entities_colors
138
 
 
139
  def highlight_entities(text1, text2):
140
  if text1 == "" or text2 == "":
141
  return []
@@ -154,49 +178,62 @@ def highlight_entities(text1, text2):
154
  return entities_with_colors
155
 
156
 
157
- def apply_highlight(text, entities_with_colors, key="input", count = 0):
158
  if entities_with_colors == []:
159
  return text, []
160
-
161
  all_starts = []
162
  all_ends = []
163
  highlighted_text = ""
164
  temp_text = text
165
  for index, entity in enumerate(entities_with_colors):
166
  highlighted_text = ""
167
-
168
- # find a list of starts and ends of entity in text:
169
  # starts = [m.start() for m in re.finditer(entity[key], temp_text)]
170
  # ends = [m.end() for m in re.finditer(entity[key], temp_text)]
171
- starts =[]
172
  ends = []
173
  # "\b" is for bound a word
174
- for m in re.finditer(r"\b" + re.escape(entity[key]) + r"\b", temp_text):
 
 
 
175
  starts.append(m.start())
176
  ends.append(m.end())
177
-
178
  all_starts.extend(starts)
179
  all_ends.extend(ends)
180
-
181
  color = entities_with_colors[index]["color"]
182
- entity_color = lighten_color(color, factor=2.2) # Lightened color for background text
183
- label_color = darken_color(entity_color, factor=0.7) # Darker color for background label (index)
184
-
 
 
 
 
 
 
185
  # Apply highlighting to each entity
186
  prev_end = 0
187
  for start, end in zip(starts, ends):
188
  # Append non-highlighted text
189
  highlighted_text += temp_text[prev_end:start]
190
-
191
  # Style the index as a label
192
- index_label = (f'<span_style="background-color:{label_color};color:white;'
193
- f'padding:1px_4px;border-radius:4px;font-size:12px;'
194
- f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>')
195
-
 
 
196
  # Append highlighted text with index label
197
- highlighted_text += (f'\n<span_style="background-color:{entity_color};color:black;'
198
- f'border-radius:3px;font-size:14px;display:inline-block;">'
199
- f'{index_label}{temp_text[start:end]}</span>\n')
 
 
200
  prev_end = end
201
  highlighted_text += temp_text[prev_end:]
202
  temp_text = highlighted_text
@@ -206,6 +243,7 @@ def apply_highlight(text, entities_with_colors, key="input", count = 0):
206
  highlight_idx_list = get_index_list(highlighted_text)
207
  return highlighted_text, highlight_idx_list
208
 
 
209
  def get_index_list(highlighted_text):
210
  """
211
  Generates a list of indices between corresponding start and end indices.
@@ -216,7 +254,7 @@ def get_index_list(highlighted_text):
216
 
217
  Returns:
218
  A list containing all indices within the specified ranges.
219
- Returns an empty list if the input is invalid (e.g., different lengths,
220
  end < start, etc.).
221
  """
222
  highlighted_index = []
@@ -226,22 +264,24 @@ def get_index_list(highlighted_text):
226
  start_index = index
227
  if word.endswith("</span>"):
228
  end_index = index
229
-
230
  highlighted_index.extend(list(range(start_index, end_index + 1)))
231
 
232
  return highlighted_index
233
 
 
234
  def extract_entities(text):
235
  output = ner_pipeline(text)
236
  words = extract_words(output)
237
  words = combine_subwords(words)
238
-
239
- # extract word in each entity and assign to a list of entities, connect words if there is no space between them
240
- entities = []
 
241
  for entity in words:
242
  if entity not in entities:
243
  entities.append(entity)
244
-
245
  return entities
246
 
247
 
@@ -275,8 +315,12 @@ def combine_subwords(word_list):
275
  i = 0
276
  while i < len(word_list):
277
  if word_list[i].startswith("##"):
278
- result[-1] += word_list[i][2:] # Remove "##" and append to the previous word
279
- elif i < len(word_list) - 2 and word_list[i + 1] == "-": # Combine hyphenated words
 
 
 
 
280
  result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
281
  i += 2 # Skip the next two words
282
  else:
@@ -286,44 +330,57 @@ def combine_subwords(word_list):
286
 
287
 
288
  original_text = """
289
- Title: UK pledges support for Ukraine with 100-year pact
290
- Content: Sir Keir Starmer has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Thursday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east. Zelensky praised the UK's commitment on Thursday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
 
 
291
  """
292
  compared_text = """
293
  Title: Japan pledges support for Ukraine with 100-year pact
294
- Content: A leading Japanese figure has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where they signed a "landmark" 100-year pact with the war-stricken country. The visit on Thursday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east. Zelensky praised Japan's commitment on Thursday, amid wider concerns that the next US President, who is set to take office on Monday, could potentially reduce aid.
 
 
295
  """
296
  if __name__ == "__main__":
297
- # text = "The Saudi authorities, I am told, are currently working flat out" \
298
- # "to collate everything they have on the Magdeburg market suspect," \
299
- # "Taleb al-Abdulmohsen, and to share it with Germany's ongoing" \
300
- # "investigation"
301
- # print(extract_entities(text))
302
-
303
-
304
  with gr.Blocks() as demo:
305
  gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
306
  text1_input = gr.Textbox(
307
- label="Paragraph 1",
308
- lines=5,
309
  value=original_text,
310
  )
311
  text2_input = gr.Textbox(
312
- label="Paragraph 2",
313
- lines=5,
314
  value=compared_text,
315
  )
316
  submit_button = gr.Button("Highlight Matches")
317
- output1 = gr.HTML("<br>"*10)
318
- output2 = gr.HTML("<br>"*10)
319
-
320
-
321
  submit_button.click(
322
  fn=highlight_entities,
323
  inputs=[text1_input, text2_input],
324
- outputs=[output1, output2]
325
  )
326
-
327
  # Launch the Gradio app
328
  demo.launch()
329
-
 
1
  import colorsys
2
  import json
3
+ import os
4
  import re
5
+
6
+ import gradio as gr
7
  import openai
8
  from dotenv import load_dotenv
 
9
  from transformers import pipeline
 
10
 
11
  ner_pipeline = pipeline("ner")
12
 
13
  load_dotenv()
14
+ AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
15
+ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
16
+ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
17
 
18
  client = openai.AzureOpenAI(
19
+ api_version="2024-05-01-preview", # AZURE_OPENAI_API_VERSION,
20
+ api_key=AZURE_OPENAI_API_KEY,
21
+ azure_endpoint=AZURE_OPENAI_ENDPOINT,
22
+ )
23
 
24
 
25
+ def extract_entities_gpt(
26
+ original_text,
27
+ compared_text,
28
+ text_generation_model="o1-mini",
29
+ ):
30
  # "gpt-4o-mini" or "o1-mini"
31
  # Generate text using the selected models
32
  prompt = f"""
33
+ Compare the ORIGINAL TEXT and the COMPARED TEXT.
34
+ Find entity pairs with significantly different meanings after paraphrasing.
35
+ Focus only on these significantly changed entities. These include:
36
+ * **Numerical changes:** e.g., "five" -> "ten," "10%" -> "50%"
37
+ * **Time changes:** e.g., "Monday" -> "Sunday," "10th" -> "21st"
38
+ * **Name changes:** e.g., "Tokyo" -> "New York," "Japan" -> "Japanese"
39
+ * **Opposite meanings:** e.g., "increase" -> "decrease," "good" -> "bad"
40
+ * **Semantically different words:** e.g., "car" -> "truck," "walk" -> "run"
41
+
42
+ Exclude entities where the meaning remains essentially the same,
43
+ even if the wording is different
44
+ (e.g., "big" changed to "large," "house" changed to "residence").
45
+ Also exclude purely stylistic changes that don't affect the core meaning.
46
+
47
+ Output the extracted entity pairs, one pair per line,
48
+ in the following JSON-like list format without wrapping characters:
49
  [
50
  ["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
51
  ["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
 
59
  # COMPARED TEXT:
60
  {compared_text}
61
  """
62
+
63
  # Generate text using the text generation model
64
+ # Generate text using the selected model
65
  try:
66
  response = client.chat.completions.create(
67
+ model=text_generation_model,
68
+ messages=[{"role": "user", "content": prompt}],
69
  )
70
+
71
  res = response.choices[0].message.content
72
 
73
  except openai.OpenAIError as e:
74
  print(f"Error interacting with OpenAI API: {e}")
75
+ res = ""
76
 
77
  return res
78
+
79
+
80
  def read_json(json_string) -> list[list[str]]:
81
  try:
82
  entities = json.loads(json_string)
 
85
  for inner_list in entities:
86
  if inner_list not in unique_entities:
87
  unique_entities.append(inner_list)
88
+
89
  return unique_entities
90
 
91
  except json.JSONDecodeError as e:
92
  print(f"Error decoding JSON: {e}")
93
  return []
94
 
95
+
96
  def lighten_color(hex_color, factor=1.8):
97
  """Lightens a HEX color by increasing its brightness in HSV space."""
98
 
99
  hex_color = hex_color.lstrip("#")
100
+ r, g, b = (
101
+ int(hex_color[0:2], 16),
102
+ int(hex_color[2:4], 16),
103
+ int(hex_color[4:6], 16),
104
+ )
105
+
106
  # Convert to HSV
107
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
108
  v = min(1.0, v * factor) # Increase brightness
109
+
110
  # Convert back to HEX
111
+ r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
112
+ return f"#{r:02x}{g:02x}{b:02x}"
113
+
114
 
115
  def darken_color(hex_color, factor=0.7):
116
  """Darkens a hex color by reducing its brightness in the HSV space."""
117
 
118
  hex_color = hex_color.lstrip("#")
119
+ r, g, b = (
120
+ int(hex_color[0:2], 16),
121
+ int(hex_color[2:4], 16),
122
+ int(hex_color[4:6], 16),
123
+ )
124
+
125
  # Convert to HSV to adjust brightness
126
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
127
  v = max(0, v * factor) # Reduce brightness
128
+
129
  # Convert back to HEX
130
+ r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
131
+ return f"#{r:02x}{g:02x}{b:02x}"
132
+
133
 
134
  def generate_color(index, total_colors=20):
135
  """Generates a unique, evenly spaced color for each index using HSL."""
136
 
137
  hue = index / total_colors # Spread hues in range [0,1]
138
  saturation = 0.65 # Keep colors vivid
139
+ lightness = 0.75 # Balanced brightness
140
+
141
  # Convert HSL to RGB
142
  r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
143
  r, g, b = int(r * 255), int(g * 255), int(b * 255)
144
+
145
+ return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
146
 
147
 
148
  def assign_colors_to_entities(entities):
 
151
  entities_colors = []
152
  for index, entity in enumerate(entities):
153
  color = generate_color(index, total_colors)
154
+
155
  # append color and index to entities_colors
156
+ entities_colors.append(
157
+ {"color": color, "input": entity[0], "source": entity[1]},
158
+ )
159
+
160
  return entities_colors
161
 
162
+
163
  def highlight_entities(text1, text2):
164
  if text1 == "" or text2 == "":
165
  return []
 
178
  return entities_with_colors
179
 
180
 
181
+ def apply_highlight(text, entities_with_colors, key="input", count=0):
182
  if entities_with_colors == []:
183
  return text, []
184
+
185
  all_starts = []
186
  all_ends = []
187
  highlighted_text = ""
188
  temp_text = text
189
  for index, entity in enumerate(entities_with_colors):
190
  highlighted_text = ""
191
+
192
+ # find a list of starts and ends of entity in text:
193
  # starts = [m.start() for m in re.finditer(entity[key], temp_text)]
194
  # ends = [m.end() for m in re.finditer(entity[key], temp_text)]
195
+ starts = []
196
  ends = []
197
  # "\b" is for bound a word
198
+ for m in re.finditer(
199
+ r"\b" + re.escape(entity[key]) + r"\b",
200
+ temp_text,
201
+ ):
202
  starts.append(m.start())
203
  ends.append(m.end())
204
+
205
  all_starts.extend(starts)
206
  all_ends.extend(ends)
207
+
208
  color = entities_with_colors[index]["color"]
209
+ entity_color = lighten_color(
210
+ color,
211
+ factor=2.2,
212
+ ) # Lightened color for background text
213
+ label_color = darken_color(
214
+ entity_color,
215
+ factor=0.7,
216
+ ) # Darker color for background label (index)
217
+
218
  # Apply highlighting to each entity
219
  prev_end = 0
220
  for start, end in zip(starts, ends):
221
  # Append non-highlighted text
222
  highlighted_text += temp_text[prev_end:start]
223
+
224
  # Style the index as a label
225
+ index_label = (
226
+ f'<span_style="background-color:{label_color};color:white;'
227
+ f"padding:1px_4px;border-radius:4px;font-size:12px;"
228
+ f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>' # noqa: E501
229
+ )
230
+
231
  # Append highlighted text with index label
232
+ highlighted_text += (
233
+ f'\n<span_style="background-color:{entity_color};color:black;'
234
+ f'border-radius:3px;font-size:14px;display:inline-block;">'
235
+ f"{index_label}{temp_text[start:end]}</span>\n"
236
+ )
237
  prev_end = end
238
  highlighted_text += temp_text[prev_end:]
239
  temp_text = highlighted_text
 
243
  highlight_idx_list = get_index_list(highlighted_text)
244
  return highlighted_text, highlight_idx_list
245
 
246
+
247
  def get_index_list(highlighted_text):
248
  """
249
  Generates a list of indices between corresponding start and end indices.
 
254
 
255
  Returns:
256
  A list containing all indices within the specified ranges.
257
+ Returns an empty list if the input is invalid (e.g., different lengths,
258
  end < start, etc.).
259
  """
260
  highlighted_index = []
 
264
  start_index = index
265
  if word.endswith("</span>"):
266
  end_index = index
267
+
268
  highlighted_index.extend(list(range(start_index, end_index + 1)))
269
 
270
  return highlighted_index
271
 
272
+
273
  def extract_entities(text):
274
  output = ner_pipeline(text)
275
  words = extract_words(output)
276
  words = combine_subwords(words)
277
+
278
+ # extract word in each entity and assign to a list of entities,
279
+ # connect words if there is no space between them
280
+ entities = []
281
  for entity in words:
282
  if entity not in entities:
283
  entities.append(entity)
284
+
285
  return entities
286
 
287
 
 
315
  i = 0
316
  while i < len(word_list):
317
  if word_list[i].startswith("##"):
318
+ result[-1] += word_list[i][
319
+ 2:
320
+ ] # Remove "##" and append to the previous word
321
+ elif (
322
+ i < len(word_list) - 2 and word_list[i + 1] == "-"
323
+ ): # Combine hyphenated words
324
  result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
325
  i += 2 # Skip the next two words
326
  else:
 
330
 
331
 
332
  original_text = """
333
+ Title: UK pledges support for Ukraine with 100-year pact
334
+ Content: Sir Keir Starmer has pledged to put Ukraine in the "strongest
335
+ possible position" on a trip to Kyiv where he signed a "landmark"
336
+ 100-year pact with the war-stricken country. The prime minister's
337
+ visit on Thursday was at one point marked by loud blasts and air
338
+ raid sirens after a reported Russian drone attack was intercepted
339
+ by Ukraine's defence systems. Acknowledging the "hello" from Russia,
340
+ Volodymyr Zelensky said Ukraine would send its own "hello back".
341
+ An estimated one million people have been killed or wounded in the
342
+ war so far. As the invasion reaches the end of its third year, Ukraine
343
+ is losing territory in the east. Zelensky praised the UK's commitment
344
+ on Thursday, amid wider concerns that the US President-elect Donald
345
+ Trump, who is set to take office on Monday, could potentially reduce aid.
346
  """
347
  compared_text = """
348
  Title: Japan pledges support for Ukraine with 100-year pact
349
+ Content: A leading Japanese figure has pledged to put Ukraine
350
+ in the "strongest possible position" on a trip to Kyiv where
351
+ they signed a "landmark" 100-year pact with the war-stricken country.
352
+ The visit on Thursday was at one point marked by loud blasts and air
353
+ raid sirens after a reported Russian drone attack was intercepted by
354
+ Ukraine's defence systems. Acknowledging the "hello" from Russia,
355
+ Volodymyr Zelensky said Ukraine would send its own "hello back".
356
+ An estimated one million people have been killed or wounded in the
357
+ war so far. As the invasion reaches the end of its third year, Ukraine
358
+ is losing territory in the east. Zelensky praised Japan's commitment
359
+ on Thursday, amid wider concerns that the next US President, who is
360
+ set to take office on Monday, could potentially reduce aid.
361
  """
362
  if __name__ == "__main__":
 
 
 
 
 
 
 
363
  with gr.Blocks() as demo:
364
  gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
365
  text1_input = gr.Textbox(
366
+ label="Paragraph 1",
367
+ lines=5,
368
  value=original_text,
369
  )
370
  text2_input = gr.Textbox(
371
+ label="Paragraph 2",
372
+ lines=5,
373
  value=compared_text,
374
  )
375
  submit_button = gr.Button("Highlight Matches")
376
+ output1 = gr.HTML("<br>" * 10)
377
+ output2 = gr.HTML("<br>" * 10)
378
+
 
379
  submit_button.click(
380
  fn=highlight_entities,
381
  inputs=[text1_input, text2_input],
382
+ outputs=[output1, output2],
383
  )
384
+
385
  # Launch the Gradio app
386
  demo.launch()
 
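The prompt above asks the model for a JSON-like list of entity pairs; a small self-contained check of how read_json parses and de-duplicates such output (the sample string is invented, reusing names from the example texts).

# Made-up sample in the format requested by the prompt, handled as read_json does.
import json

sample = '[["UK", "Japan"], ["Sir Keir Starmer", "A leading Japanese figure"], ["UK", "Japan"]]'

pairs = json.loads(sample)
unique_pairs = []
for pair in pairs:
    if pair not in unique_pairs:   # keep only the first occurrence of each pair
        unique_pairs.append(pair)
print(unique_pairs)   # [['UK', 'Japan'], ['Sir Keir Starmer', 'A leading Japanese figure']]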
src/application/text/helper.py CHANGED
@@ -1,73 +1,72 @@
1
- from collections import Counter
2
- from difflib import SequenceMatcher
3
  import re
4
  import string
5
- from sklearn.feature_extraction.text import TfidfVectorizer
 
 
6
  from nltk.tokenize import word_tokenize
7
  from nltk.util import ngrams
 
8
 
9
 
10
  def clean_text(text):
11
  """Doc cleaning"""
12
- punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~""" # not include , and . due to number
 
 
13
  # Lowering text
14
  text = text.lower()
15
-
16
  # Removing punctuation
17
  text = "".join([c for c in text if c not in punctuations])
18
-
19
  # Removing whitespace and newlines
20
- text = re.sub(r'\s+',' ',text)
21
-
22
  text.replace("£", " * ")
23
-
24
  words = text.split()
25
- text = ' '.join(words[:18]) # Join the first 18 words back into a string
26
-
27
  return text
28
 
 
29
  def remove_punctuation(text):
30
  """Remove punctuation from a given text."""
31
  punctuation_without_dot = string.punctuation.replace(".", "")
32
- translator = str.maketrans('', '', punctuation_without_dot)
33
  return text.translate(translator)
34
 
 
35
  def get_keywords(text, num_keywords=5):
36
  """Return top k keywords from a doc using TF-IDF method"""
37
-
38
  # Create a TF-IDF Vectorizer
39
- vectorizer = TfidfVectorizer(stop_words='english')
40
-
41
  # Fit and transform the text
42
  tfidf_matrix = vectorizer.fit_transform([text])
43
-
44
  # Get feature names (words)
45
  feature_names = vectorizer.get_feature_names_out()
46
-
47
  # Get TF-IDF scores
48
  tfidf_scores = tfidf_matrix.toarray()[0]
49
-
50
  # Sort words by TF-IDF score
51
  word_scores = list(zip(feature_names, tfidf_scores))
52
  word_scores.sort(key=lambda x: x[1], reverse=True)
53
-
54
  # Return top keywords
55
  return [word for word, score in word_scores[:num_keywords]]
56
 
57
- """
58
- # Example usage
59
- text = "Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals. Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however this definition is rejected by major AI researchers."
60
- print(f"\n# Input text:\n'{text}'")
61
- print("\n----------------------\n")
62
-
63
- keywords = get_keywords(text)
64
- print("# Top keywords:", keywords)
65
- print("\n----------------------\n")
66
- """
67
 
68
- def get_important_sentences(paragraph: str, keywords: list[str], num_sentences: int = 3) -> list[str]:
 
 
 
 
69
  """
70
- Selects important sentences from a given paragraph based on a list of keywords.
71
 
72
  Args:
73
  paragraph (str): The input paragraph.
@@ -78,8 +77,10 @@ def get_important_sentences(paragraph: str, keywords: list[str], num_sentences:
78
  list: A list of important sentences.
79
  """
80
  # Clean and split the paragraph into sentences
81
- sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', paragraph) if s.strip()]
82
-
 
 
83
  # Calculate the importance score for each sentence
84
  sentence_scores = []
85
  for sentence in sentences:
@@ -87,54 +88,49 @@ def get_important_sentences(paragraph: str, keywords: list[str], num_sentences:
87
  score = 0
88
  words = processed_sentence.lower().split()
89
  word_count = Counter(words)
90
-
91
  for keyword in keywords:
92
  if keyword.lower() in word_count:
93
  score += word_count[keyword.lower()]
94
-
95
  sentence_scores.append((sentence, score))
96
-
97
  # Sort sentences by their scores in descending order
98
  sentence_scores.sort(key=lambda x: x[1], reverse=True)
99
-
100
  # Return the top N sentences
101
  return [sentence for sentence, score in sentence_scores[:num_sentences]]
102
 
103
- """# Example usage
104
- keywords = get_keywords(paragraph)
105
- important_sentences = get_important_sentences(paragraph, keywords)
106
-
107
- print("# Important sentences:")
108
- for i, sentence in enumerate(important_sentences, 1):
109
- print(f"{i}. {sentence}")
110
- print("\n----------------------\n")
111
- """
112
 
113
- def extract_important_phrases(paragraph: str, keywords: list[str], phrase_length: int = 5) -> list[str]:
 
 
 
 
114
  """
115
- Extracts important phrases from a given paragraph based on a list of keywords.
116
  Phrase length is auto-determined, and overlapped parts are less than 20%.
117
 
118
  Args:
119
  paragraph (str): The input paragraph.
120
  keywords (list[str]): List of important keywords.
121
- phrase_length (int): The length of phrases to extract (default is 5 words).
122
 
123
  Returns:
124
  list: A list of important phrases.
125
  """
126
  # Tokenize the paragraph into words
127
  words = word_tokenize(paragraph.lower())
128
-
129
  # Determine phrase length (between 3 and 7 words)
130
  phrase_length = min(max(len(words) // 10, 5), 7)
131
-
132
  # Generate n-grams (phrases) from the paragraph
133
  phrases = list(ngrams(words, phrase_length))
134
-
135
  important_phrases = []
136
  used_indices = set()
137
-
138
  for i, phrase in enumerate(phrases):
139
  # Check if the phrase contains any keyword
140
  if any(keyword.lower() in phrase for keyword in keywords):
@@ -142,33 +138,36 @@ def extract_important_phrases(paragraph: str, keywords: list[str], phrase_length
142
  if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
143
  important_phrases.append(clean_text(" ".join(phrase)))
144
  used_indices.add(i)
145
-
146
  return important_phrases
147
 
 
148
  def extract_equal_text(text1, text2):
149
  def cleanup(text):
150
  text = text.lower()
151
- text = text.translate(str.maketrans('', '', string.punctuation))
152
  return text
153
-
154
  splited_text1 = cleanup(text1).split()
155
  splited_text2 = cleanup(text2).split()
156
-
157
  s = SequenceMatcher(None, splited_text1, splited_text2)
158
-
159
  equal_idx_1 = []
160
  equal_idx_2 = []
161
  text1 = text1.split()
162
  text2 = text2.split()
163
  for tag, i1, i2, j1, j2 in s.get_opcodes():
164
- if tag == 'equal':
165
  equal_idx_1.append({"start": i1, "end": i2})
166
  equal_idx_2.append({"start": j1, "end": j2})
167
  # subtext_1 = " ".join(text1[i1:i2])
168
  # subtext_2 = " ".join(text2[j1:j2])
169
- # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
 
170
  return equal_idx_1, equal_idx_2
171
 
 
172
  def connect_consecutive_indexes(nums):
173
  """
174
  Connects consecutive integers in a list.
@@ -197,11 +196,3 @@ def connect_consecutive_indexes(nums):
197
 
198
  result.append([start, end]) # Add the last range
199
  return result
200
-
201
- """# Example usage
202
- keywords = get_keywords(paragraph)
203
- important_phrases = extract_important_phrases(paragraph, keywords)
204
-
205
- print("# Important phrases:")
206
- for i, phrase in enumerate(important_phrases[:5], 1): # Print top 5 phrases
207
- print(f"{i}. {phrase}")"""
 
 
 
1
  import re
2
  import string
3
+ from collections import Counter
4
+ from difflib import SequenceMatcher
5
+
6
  from nltk.tokenize import word_tokenize
7
  from nltk.util import ngrams
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
 
10
 
11
  def clean_text(text):
12
  """Doc cleaning"""
13
+ # exclude , and . due to number
14
+ punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
15
+
16
  # Lowering text
17
  text = text.lower()
18
+
19
  # Removing punctuation
20
  text = "".join([c for c in text if c not in punctuations])
21
+
22
  # Removing whitespace and newlines
23
+ text = re.sub(r"\s+", " ", text)
24
+
25
  text.replace("£", " * ")
26
+
27
  words = text.split()
28
+ text = " ".join(words[:18]) # Join the first 18 words back into a string
29
+
30
  return text
31
 
32
+
33
  def remove_punctuation(text):
34
  """Remove punctuation from a given text."""
35
  punctuation_without_dot = string.punctuation.replace(".", "")
36
+ translator = str.maketrans("", "", punctuation_without_dot)
37
  return text.translate(translator)
38
 
39
+
40
  def get_keywords(text, num_keywords=5):
41
  """Return top k keywords from a doc using TF-IDF method"""
42
+
43
  # Create a TF-IDF Vectorizer
44
+ vectorizer = TfidfVectorizer(stop_words="english")
45
+
46
  # Fit and transform the text
47
  tfidf_matrix = vectorizer.fit_transform([text])
48
+
49
  # Get feature names (words)
50
  feature_names = vectorizer.get_feature_names_out()
51
+
52
  # Get TF-IDF scores
53
  tfidf_scores = tfidf_matrix.toarray()[0]
54
+
55
  # Sort words by TF-IDF score
56
  word_scores = list(zip(feature_names, tfidf_scores))
57
  word_scores.sort(key=lambda x: x[1], reverse=True)
58
+
59
  # Return top keywords
60
  return [word for word, score in word_scores[:num_keywords]]
61
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ def get_important_sentences(
64
+ paragraph: str,
65
+ keywords: list[str],
66
+ num_sentences: int = 3,
67
+ ) -> list[str]:
68
  """
69
+ Selects important sentences based on a list of keywords.
70
 
71
  Args:
72
  paragraph (str): The input paragraph.
 
77
  list: A list of important sentences.
78
  """
79
  # Clean and split the paragraph into sentences
80
+ sentences = [
81
+ s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
82
+ ]
83
+
84
  # Calculate the importance score for each sentence
85
  sentence_scores = []
86
  for sentence in sentences:
 
88
  score = 0
89
  words = processed_sentence.lower().split()
90
  word_count = Counter(words)
91
+
92
  for keyword in keywords:
93
  if keyword.lower() in word_count:
94
  score += word_count[keyword.lower()]
95
+
96
  sentence_scores.append((sentence, score))
97
+
98
  # Sort sentences by their scores in descending order
99
  sentence_scores.sort(key=lambda x: x[1], reverse=True)
100
+
101
  # Return the top N sentences
102
  return [sentence for sentence, score in sentence_scores[:num_sentences]]
103
 
 
 
 
 
 
 
 
 
 
104
 
105
+ def extract_important_phrases(
106
+ paragraph: str,
107
+ keywords: list[str],
108
+ phrase_length: int = 5,
109
+ ) -> list[str]:
110
  """
111
+ Extracts important phrases based on a list of keywords.
112
  Phrase length is auto-determined, and overlapped parts are less than 20%.
113
 
114
  Args:
115
  paragraph (str): The input paragraph.
116
  keywords (list[str]): List of important keywords.
117
+ phrase_length (int): Length of phrases to extract (default: 5 words).
118
 
119
  Returns:
120
  list: A list of important phrases.
121
  """
122
  # Tokenize the paragraph into words
123
  words = word_tokenize(paragraph.lower())
124
+
125
  # Determine phrase length (between 3 and 7 words)
126
  phrase_length = min(max(len(words) // 10, 5), 7)
127
+
128
  # Generate n-grams (phrases) from the paragraph
129
  phrases = list(ngrams(words, phrase_length))
130
+
131
  important_phrases = []
132
  used_indices = set()
133
+
134
  for i, phrase in enumerate(phrases):
135
  # Check if the phrase contains any keyword
136
  if any(keyword.lower() in phrase for keyword in keywords):
 
138
  if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
139
  important_phrases.append(clean_text(" ".join(phrase)))
140
  used_indices.add(i)
141
+
142
  return important_phrases
143
 
144
+
145
  def extract_equal_text(text1, text2):
146
  def cleanup(text):
147
  text = text.lower()
148
+ text = text.translate(str.maketrans("", "", string.punctuation))
149
  return text
150
+
151
  splited_text1 = cleanup(text1).split()
152
  splited_text2 = cleanup(text2).split()
153
+
154
  s = SequenceMatcher(None, splited_text1, splited_text2)
155
+
156
  equal_idx_1 = []
157
  equal_idx_2 = []
158
  text1 = text1.split()
159
  text2 = text2.split()
160
  for tag, i1, i2, j1, j2 in s.get_opcodes():
161
+ if tag == "equal":
162
  equal_idx_1.append({"start": i1, "end": i2})
163
  equal_idx_2.append({"start": j1, "end": j2})
164
  # subtext_1 = " ".join(text1[i1:i2])
165
  # subtext_2 = " ".join(text2[j1:j2])
166
+ # print(f'{tag:7} a[{i1:2}:{i2:2}]
167
+ # --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
168
  return equal_idx_1, equal_idx_2
169
 
170
+
171
  def connect_consecutive_indexes(nums):
172
  """
173
  Connects consecutive integers in a list.
 
196
 
197
  result.append([start, end]) # Add the last range
198
  return result
 
 
 
 
 
 
 
 
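A short usage sketch of the TF-IDF keyword helper defined above; the input sentence is invented, and the exact ranking depends on scikit-learn's built-in English stop-word list.

# Usage sketch for get_keywords: top-5 words of a single document by TF-IDF score.
from sklearn.feature_extraction.text import TfidfVectorizer

text = "Deep learning models power modern image classifiers and fake news detection."
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform([text])

scores = list(zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0]))
scores.sort(key=lambda x: x[1], reverse=True)
print([word for word, _ in scores[:5]])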
src/application/text/highlight_text.py CHANGED
@@ -1,36 +1,45 @@
1
- import gradio as gr
2
  import colorsys
3
- from functools import partial
4
- import random
5
 
6
 
7
  def lighten_color(hex_color, factor=1.8):
8
  """Lightens a HEX color by increasing its brightness in HSV space."""
9
 
10
  hex_color = hex_color.lstrip("#")
11
- r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
12
-
 
 
 
 
13
  # Convert to HSV
14
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
15
  v = min(1.0, v * factor) # Increase brightness
16
-
17
  # Convert back to HEX
18
- r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
19
- return f'#{r:02x}{g:02x}{b:02x}'
 
20
 
21
  def darken_color(hex_color, factor=0.7):
22
  """Darkens a hex color by reducing its brightness in the HSV space."""
23
 
24
  hex_color = hex_color.lstrip("#")
25
- r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
26
-
 
 
 
 
27
  # Convert to HSV to adjust brightness
28
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
29
  v = max(0, v * factor) # Reduce brightness
30
-
31
  # Convert back to HEX
32
- r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
33
- return f'#{r:02x}{g:02x}{b:02x}'
 
34
 
35
  # Generate unique colors for pairs
36
  def generate_color(index, total_colors=20):
@@ -38,51 +47,98 @@ def generate_color(index, total_colors=20):
38
 
39
  hue = index / total_colors # Spread hues in range [0,1]
40
  saturation = 0.65 # Keep colors vivid
41
- lightness = 0.75 # Balanced brightness
42
-
43
  # Convert HSL to RGB
44
  r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
45
  r, g, b = int(r * 255), int(g * 255), int(b * 255)
46
-
47
- return f'#{r:02x}{g:02x}{b:02x}' # Convert to hex
 
48
 
49
  def highlight_pairs(text1, text2):
50
  """Highlight matching pairs between two paragraphs"""
51
  # Predefined matching pairs
52
  match_pairs = [
53
- {"index": 1, "text1": "deep learning", "start1": 13, "end1": 26,
54
- "text2": "deep learning", "start2": 12, "end2": 25},
55
- {"index": 2, "text1": "neural networks", "start1": 56, "end1": 71,
56
- "text2": "neural networks", "start2": 68, "end2": 83},
57
- {"index": 3, "text1": "AI research", "start1": 86, "end1": 97,
58
- "text2": "AI research", "start2": 55, "end2": 66},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  ]
60
 
61
  # Assign unique colors to each index
62
- pair_colors = {pair["index"]: generate_color(pair["index"], total_colors=len(match_pairs)) for pair in match_pairs}
63
-
64
-
65
- def apply_highlight(text, pairs, key_start, key_end, key_index, pair_colors):
 
 
 
 
 
 
 
 
 
 
 
 
66
  highlighted_text = ""
67
  prev_end = 0
68
-
69
  for pair in sorted(pairs, key=lambda x: x[key_start]):
70
  start, end, index = pair[key_start], pair[key_end], pair[key_index]
71
- color = pair_colors.get(index, "#ddd") # Default color if not found
72
- color = lighten_color(color, factor=2.2) # Lightened color for background text
73
- label_color = darken_color(color, factor=0.7) # Make label color darker
 
 
 
 
 
 
 
 
 
74
 
75
  # Style the index as a label
76
- index_label = (f'<span style="background-color:{label_color}; color:white; '
77
- f'padding:1px 4px; border-radius:4px; font-size:12px; '
78
- f'font-weight:bold; display:inline-block; margin-right:4px;">{index}</span>')
 
 
79
 
80
  # Append non-highlighted text
81
  highlighted_text += text[prev_end:start]
82
  # Append highlighted text with index label
83
- highlighted_text += (f'<span style="background-color:{color}; '
84
- f'border-radius:3px; font-size:14px; display:inline-block;">'
85
- f'{index_label} {text[start:end]}</span>')
 
 
86
  prev_end = end
87
 
88
  # Append remaining text
@@ -90,36 +146,57 @@ def highlight_pairs(text1, text2):
90
  return highlighted_text
91
 
92
  # Apply highlighting to both paragraphs using the global MATCH_PAIRS
93
- highlighted_text1 = apply_highlight(text1, match_pairs, "start1", "end1", "index", pair_colors)
94
- highlighted_text2 = apply_highlight(text2, match_pairs, "start2", "end2", "index", pair_colors)
 
95
 
96
  return highlighted_text1, highlighted_text2
97
 
98
- if __name__ == '__main__':
 
99
  # Create Gradio Interface
100
  text1 = ""
101
-
102
  with gr.Blocks() as demo:
103
  gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
104
  text1_input = gr.Textbox(
105
- label="Paragraph 1",
106
- lines=5,
107
- value="The field of deep learning is advancing rapidly. Modern neural networks are improving AI research significantly."
 
 
 
108
  )
109
  text2_input = gr.Textbox(
110
- label="Paragraph 2",
111
- lines=5,
112
- value="Advances in deep learning have led to breakthroughs in AI research. Neural networks are at the core of these innovations"
 
 
 
113
  )
114
  output1 = gr.HTML()
115
  output2 = gr.HTML()
116
  submit_button = gr.Button("Highlight Matches")
117
-
118
  submit_button.click(
119
  fn=highlight_pairs,
120
  inputs=[text1_input, text2_input],
121
- outputs=[output1, output2]
122
  )
123
-
124
  # Launch the Gradio app
125
  demo.launch()
 
 
1
  import colorsys
2
+
3
+ import gradio as gr
4
 
5
 
6
  def lighten_color(hex_color, factor=1.8):
7
  """Lightens a HEX color by increasing its brightness in HSV space."""
8
 
9
  hex_color = hex_color.lstrip("#")
10
+ r, g, b = (
11
+ int(hex_color[0:2], 16),
12
+ int(hex_color[2:4], 16),
13
+ int(hex_color[4:6], 16),
14
+ )
15
+
16
  # Convert to HSV
17
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
18
  v = min(1.0, v * factor) # Increase brightness
19
+
20
  # Convert back to HEX
21
+ r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
22
+ return f"#{r:02x}{g:02x}{b:02x}"
23
+
24
 
25
  def darken_color(hex_color, factor=0.7):
26
  """Darkens a hex color by reducing its brightness in the HSV space."""
27
 
28
  hex_color = hex_color.lstrip("#")
29
+ r, g, b = (
30
+ int(hex_color[0:2], 16),
31
+ int(hex_color[2:4], 16),
32
+ int(hex_color[4:6], 16),
33
+ )
34
+
35
  # Convert to HSV to adjust brightness
36
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
37
  v = max(0, v * factor) # Reduce brightness
38
+
39
  # Convert back to HEX
40
+ r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
41
+ return f"#{r:02x}{g:02x}{b:02x}"
42
+
43
 
44
  # Generate unique colors for pairs
45
  def generate_color(index, total_colors=20):
 
47
 
48
  hue = index / total_colors # Spread hues in range [0,1]
49
  saturation = 0.65 # Keep colors vivid
50
+ lightness = 0.75 # Balanced brightness
51
+
52
  # Convert HSL to RGB
53
  r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
54
  r, g, b = int(r * 255), int(g * 255), int(b * 255)
55
+
56
+ return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
57
+
58
 
59
  def highlight_pairs(text1, text2):
60
  """Highlight matching pairs between two paragraphs"""
61
  # Predefined matching pairs
62
  match_pairs = [
63
+ {
64
+ "index": 1,
65
+ "text1": "deep learning",
66
+ "start1": 13,
67
+ "end1": 26,
68
+ "text2": "deep learning",
69
+ "start2": 12,
70
+ "end2": 25,
71
+ },
72
+ {
73
+ "index": 2,
74
+ "text1": "neural networks",
75
+ "start1": 56,
76
+ "end1": 71,
77
+ "text2": "neural networks",
78
+ "start2": 68,
79
+ "end2": 83,
80
+ },
81
+ {
82
+ "index": 3,
83
+ "text1": "AI research",
84
+ "start1": 86,
85
+ "end1": 97,
86
+ "text2": "AI research",
87
+ "start2": 55,
88
+ "end2": 66,
89
+ },
90
  ]
91
 
92
  # Assign unique colors to each index
93
+ pair_colors = {
94
+ pair["index"]: generate_color(
95
+ pair["index"],
96
+ total_colors=len(match_pairs),
97
+ )
98
+ for pair in match_pairs
99
+ }
100
+
101
+ def apply_highlight(
102
+ text,
103
+ pairs,
104
+ key_start,
105
+ key_end,
106
+ key_index,
107
+ pair_colors,
108
+ ):
109
  highlighted_text = ""
110
  prev_end = 0
111
+
112
  for pair in sorted(pairs, key=lambda x: x[key_start]):
113
  start, end, index = pair[key_start], pair[key_end], pair[key_index]
114
+ color = pair_colors.get(
115
+ index,
116
+ "#ddd",
117
+ ) # Default color if not found
118
+ color = lighten_color(
119
+ color,
120
+ factor=2.2,
121
+ ) # Lightened color for background text
122
+ label_color = darken_color(
123
+ color,
124
+ factor=0.7,
125
+ ) # Make label color darker
126
 
127
  # Style the index as a label
128
+ index_label = (
129
+ f'<span style="background-color:{label_color}; color:white; '
130
+ f"padding:1px 4px; border-radius:4px; font-size:12px; "
131
+ f'font-weight:bold; display:inline-block; margin-right:4px;">{index}</span>' # noqa: E501
132
+ )
133
 
134
  # Append non-highlighted text
135
  highlighted_text += text[prev_end:start]
136
  # Append highlighted text with index label
137
+ highlighted_text += (
138
+ f'<span style="background-color:{color}; '
139
+ f'border-radius:3px; font-size:14px; display:inline-block;">'
140
+ f"{index_label} {text[start:end]}</span>"
141
+ )
142
  prev_end = end
143
 
144
  # Append remaining text
 
146
  return highlighted_text
147
 
148
  # Apply highlighting to both paragraphs using the global MATCH_PAIRS
149
+ highlighted_text1 = apply_highlight(
150
+ text1,
151
+ match_pairs,
152
+ "start1",
153
+ "end1",
154
+ "index",
155
+ pair_colors,
156
+ )
157
+ highlighted_text2 = apply_highlight(
158
+ text2,
159
+ match_pairs,
160
+ "start2",
161
+ "end2",
162
+ "index",
163
+ pair_colors,
164
+ )
165
 
166
  return highlighted_text1, highlighted_text2
167
 
168
+
169
+ if __name__ == "__main__":
170
  # Create Gradio Interface
171
  text1 = ""
172
+
173
  with gr.Blocks() as demo:
174
  gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
175
  text1_input = gr.Textbox(
176
+ label="Paragraph 1",
177
+ lines=5,
178
+ value="""
179
+ The field of deep learning is advancing rapidly.
180
+ Modern neural networks are improving AI research significantly.
181
+ """,
182
  )
183
  text2_input = gr.Textbox(
184
+ label="Paragraph 2",
185
+ lines=5,
186
+ value="""
187
+ Advances in deep learning have led to breakthroughs in AI research.
188
+ Neural networks are at the core of these innovations.
189
+ """,
190
  )
191
  output1 = gr.HTML()
192
  output2 = gr.HTML()
193
  submit_button = gr.Button("Highlight Matches")
194
+
195
  submit_button.click(
196
  fn=highlight_pairs,
197
  inputs=[text1_input, text2_input],
198
+ outputs=[output1, output2],
199
  )
200
+
201
  # Launch the Gradio app
202
  demo.launch()
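
The three colour helpers above (generate_color, lighten_color, darken_color) are shown in full, so a short sanity-check sketch is possible; the printed hex values are illustrative, not exact outputs.

# Sketch: build a label/background colour pair the same way highlight_pairs does.
base = generate_color(3, total_colors=20)        # pastel hex string for pair index 3
background = lighten_color(base, factor=2.2)     # pale fill behind the matched text
label = darken_color(background, factor=0.7)     # darker badge colour for the index label
print(base, background, label)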
src/application/text/model_detection.py CHANGED
@@ -19,7 +19,7 @@ def detect_text_by_ai_model(
19
  """
20
  Model: chatgpt_detector_roberta
21
  Ref: https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta
22
-
23
  Detects if text is human or machine generated.
24
 
25
  Returns:
 
19
  """
20
  Model: chatgpt_detector_roberta
21
  Ref: https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta
22
+
23
  Detects if text is human or machine generated.
24
 
25
  Returns:
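
This hunk only re-wraps the docstring of detect_text_by_ai_model; the function body lies outside it. As a hedged sketch (not the repository's implementation), the checkpoint referenced in the docstring can be queried directly with the Hugging Face pipeline API:

from transformers import pipeline

# Sketch only: assumes the standard text-classification pipeline,
# not the actual body of detect_text_by_ai_model.
detector = pipeline(
    "text-classification",
    model="Hello-SimpleAI/chatgpt-detector-roberta",
)
prediction = detector("Paste a paragraph here to classify.")[0]
print(prediction["label"], prediction["score"])  # label names depend on the model card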
src/application/text/preprocessing.py CHANGED
@@ -1,5 +1,6 @@
1
  from nltk.tokenize import sent_tokenize
2
 
 
3
  def split_into_paragraphs(input_text):
4
  """
5
  Splits input text into sentences by newlines.
@@ -17,6 +18,6 @@ def split_into_paragraphs(input_text):
17
  sentences = []
18
  for paragraph in paragraphs:
19
  paragraph = paragraph.strip()
20
- if paragraph and paragraph != '\n':
21
  sentences.extend(sent_tokenize(paragraph))
22
- return sentences
 
1
  from nltk.tokenize import sent_tokenize
2
 
3
+
4
  def split_into_paragraphs(input_text):
5
  """
6
  Splits input text into sentences by newlines.
 
18
  sentences = []
19
  for paragraph in paragraphs:
20
  paragraph = paragraph.strip()
21
+ if paragraph and paragraph != "\n":
22
  sentences.extend(sent_tokenize(paragraph))
23
+ return sentences
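
A minimal sketch of split_into_paragraphs in use; the paragraph-splitting line itself is outside this hunk, so the expected output is an assumption based on the docstring (newline-separated paragraphs, each run through NLTK sentence tokenization).

import nltk
nltk.download("punkt", quiet=True)  # sent_tokenize needs the punkt data

sample = "First paragraph. It has two sentences.\nSecond paragraph."
print(split_into_paragraphs(sample))
# expected: ['First paragraph.', 'It has two sentences.', 'Second paragraph.']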
src/application/text/search.py CHANGED
@@ -1,6 +1,7 @@
1
- from collections import Counter
2
  import os
3
  import string
 
 
4
  import requests
5
  from dotenv import load_dotenv
6
  from nltk.corpus import stopwords
@@ -9,27 +10,28 @@ from sklearn.feature_extraction.text import TfidfVectorizer
9
 
10
  from src.application.text.entity import extract_entities
11
 
12
- load_dotenv()
13
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
14
  SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
15
 
 
16
  def search_by_google(
17
- query,
18
  num_results=10,
19
- is_exact_terms = False
20
- ) -> dict:
21
  """
22
  Searches the Google Custom Search Engine for the given query.
23
 
24
  Args:
25
  query: The search query.
26
- is_exact_terms: Whether to use exact terms search (True) or regular search (False).
27
  num_results: The number of results to return (default: 10).
28
 
29
  Returns:
30
- A dictionary containing the search results or None if there was an error.
31
  """
32
-
33
  url = "https://www.googleapis.com/customsearch/v1"
34
  params = {
35
  "key": GOOGLE_API_KEY,
@@ -40,7 +42,7 @@ def search_by_google(
40
  params["exactTerms"] = query
41
  else:
42
  params["q"] = query.replace('"', "")
43
-
44
  response = requests.get(url, params=params)
45
  if response.status_code == 200:
46
  return response.json()
@@ -48,9 +50,11 @@ def search_by_google(
48
  print(f"Error: {response.status_code}, {response.text}")
49
  return None
50
 
 
51
  def get_most_frequent_words(input_text, number_word=32):
52
  """
53
- Gets the top words from the input text, excluding stop words and punctuation.
 
54
 
55
  Args:
56
  input_text: The input text as a string.
@@ -65,18 +69,21 @@ def get_most_frequent_words(input_text, number_word=32):
65
 
66
  words = word_tokenize(input_text.lower()) # Tokenize and lowercase
67
 
68
- stop_words = set(stopwords.words('english'))
69
- punctuation = set(string.punctuation) # get all punctuation
70
  filtered_words = [
71
- word for word in words
72
- if word.isalnum() and word not in stop_words and word not in punctuation
 
 
 
73
  ]
74
  word_frequencies = Counter(filtered_words)
75
  top_words = word_frequencies.most_common(number_word)
76
-
77
  for top_word in top_words:
78
  words.append(top_word[0])
79
-
80
  if len(words) > 32:
81
  search_phrase = " ".join(words[:32])
82
  else:
@@ -84,6 +91,7 @@ def get_most_frequent_words(input_text, number_word=32):
84
 
85
  return search_phrase
86
 
 
87
  def get_chunk(input_text, chunk_length=32, num_chunk=3):
88
  """
89
  Splits the input text into chunks of a specified length.
@@ -94,7 +102,7 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
94
  chunk_length: The desired length of each chunk (in words).
95
 
96
  Returns:
97
- A list of string chunks.
98
  Returns an empty list if input is invalid.
99
  """
100
  if not isinstance(input_text, str):
@@ -112,25 +120,26 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
112
 
113
  return chunks
114
 
 
115
  def get_keywords(text, num_keywords=5):
116
  """Return top k keywords from a doc using TF-IDF method"""
117
-
118
  # Create a TF-IDF Vectorizer
119
- vectorizer = TfidfVectorizer(stop_words='english')
120
-
121
  # Fit and transform the text
122
  tfidf_matrix = vectorizer.fit_transform([text])
123
-
124
  # Get feature names (words)
125
  feature_names = vectorizer.get_feature_names_out()
126
-
127
  # Get TF-IDF scores
128
  tfidf_scores = tfidf_matrix.toarray()[0]
129
-
130
  # Sort words by TF-IDF score
131
  word_scores = list(zip(feature_names, tfidf_scores))
132
  word_scores.sort(key=lambda x: x[1], reverse=True)
133
-
134
  # Return top keywords
135
  return [word for word, score in word_scores[:num_keywords]]
136
 
@@ -150,29 +159,30 @@ def generate_search_phrases(input_text):
150
  """
151
  if not isinstance(input_text, str):
152
  return []
153
-
154
  search_phrases = []
155
-
156
  # Method 1: Get most frequent words
157
  search_phrases.append(get_most_frequent_words(input_text))
158
-
159
  # Method 2: Get the whole text
160
  search_phrases.append(input_text)
161
-
162
  # Method 3: Split text by chunks
163
  search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
164
-
165
  # Method 4: Get most identities and key words
166
  entities = extract_entities(input_text)
167
  text_without_entities = remove_identities_from_text(input_text, entities)
168
  print(f"text_without_entities: {text_without_entities}")
169
  search_phrases.append(text_without_entities)
170
- #keywords = get_keywords(input_text, 16)
171
- #search_phrase = " ".join(entities) + " " + " ".join(keywords)
172
  # search_phrases.append(search_phrase) # TODO: for demo purposes
173
-
174
  return search_phrases
175
 
 
176
  def remove_identities_from_text(input_text, entities):
177
  """
178
  Removes entities from the input text.
@@ -183,5 +193,5 @@ def remove_identities_from_text(input_text, entities):
183
  """
184
  for entity in entities:
185
  input_text = input_text.replace(entity, "")
186
-
187
  return input_text
 
 
1
  import os
2
  import string
3
+ from collections import Counter
4
+
5
  import requests
6
  from dotenv import load_dotenv
7
  from nltk.corpus import stopwords
 
10
 
11
  from src.application.text.entity import extract_entities
12
 
13
+ load_dotenv()
14
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
15
  SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
16
 
17
+
18
  def search_by_google(
19
+ query,
20
  num_results=10,
21
+ is_exact_terms=False,
22
+ ) -> dict:
23
  """
24
  Searches the Google Custom Search Engine for the given query.
25
 
26
  Args:
27
  query: The search query.
28
+ is_exact_terms: Whether to use exact terms search (True) or not.
29
  num_results: The number of results to return (default: 10).
30
 
31
  Returns:
32
+ A dict containing the search results or None if there was an error.
33
  """
34
+
35
  url = "https://www.googleapis.com/customsearch/v1"
36
  params = {
37
  "key": GOOGLE_API_KEY,
 
42
  params["exactTerms"] = query
43
  else:
44
  params["q"] = query.replace('"', "")
45
+
46
  response = requests.get(url, params=params)
47
  if response.status_code == 200:
48
  return response.json()
 
50
  print(f"Error: {response.status_code}, {response.text}")
51
  return None
52
 
53
+
54
  def get_most_frequent_words(input_text, number_word=32):
55
  """
56
+ Gets the top words from the input text,
57
+ excluding stop words and punctuation.
58
 
59
  Args:
60
  input_text: The input text as a string.
 
69
 
70
  words = word_tokenize(input_text.lower()) # Tokenize and lowercase
71
 
72
+ stop_words = set(stopwords.words("english"))
73
+ punctuation = set(string.punctuation) # get all punctuation
74
  filtered_words = [
75
+ word
76
+ for word in words
77
+ if word.isalnum()
78
+ and word not in stop_words
79
+ and word not in punctuation
80
  ]
81
  word_frequencies = Counter(filtered_words)
82
  top_words = word_frequencies.most_common(number_word)
83
+
84
  for top_word in top_words:
85
  words.append(top_word[0])
86
+
87
  if len(words) > 32:
88
  search_phrase = " ".join(words[:32])
89
  else:
 
91
 
92
  return search_phrase
93
 
94
+
95
  def get_chunk(input_text, chunk_length=32, num_chunk=3):
96
  """
97
  Splits the input text into chunks of a specified length.
 
102
  chunk_length: The desired length of each chunk (in words).
103
 
104
  Returns:
105
+ A list of string chunks.
106
  Returns an empty list if input is invalid.
107
  """
108
  if not isinstance(input_text, str):
 
120
 
121
  return chunks
122
 
123
+
124
  def get_keywords(text, num_keywords=5):
125
  """Return top k keywords from a doc using TF-IDF method"""
126
+
127
  # Create a TF-IDF Vectorizer
128
+ vectorizer = TfidfVectorizer(stop_words="english")
129
+
130
  # Fit and transform the text
131
  tfidf_matrix = vectorizer.fit_transform([text])
132
+
133
  # Get feature names (words)
134
  feature_names = vectorizer.get_feature_names_out()
135
+
136
  # Get TF-IDF scores
137
  tfidf_scores = tfidf_matrix.toarray()[0]
138
+
139
  # Sort words by TF-IDF score
140
  word_scores = list(zip(feature_names, tfidf_scores))
141
  word_scores.sort(key=lambda x: x[1], reverse=True)
142
+
143
  # Return top keywords
144
  return [word for word, score in word_scores[:num_keywords]]
145
 
 
159
  """
160
  if not isinstance(input_text, str):
161
  return []
162
+
163
  search_phrases = []
164
+
165
  # Method 1: Get most frequent words
166
  search_phrases.append(get_most_frequent_words(input_text))
167
+
168
  # Method 2: Get the whole text
169
  search_phrases.append(input_text)
170
+
171
  # Method 3: Split text by chunks
172
  search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
173
+
174
  # Method 4: Get most identities and key words
175
  entities = extract_entities(input_text)
176
  text_without_entities = remove_identities_from_text(input_text, entities)
177
  print(f"text_without_entities: {text_without_entities}")
178
  search_phrases.append(text_without_entities)
179
+ # keywords = get_keywords(input_text, 16)
180
+ # search_phrase = " ".join(entities) + " " + " ".join(keywords)
181
  # search_phrases.append(search_phrase) # TODO: for demo purposes
182
+
183
  return search_phrases
184
 
185
+
186
  def remove_identities_from_text(input_text, entities):
187
  """
188
  Removes entities from the input text.
 
193
  """
194
  for entity in entities:
195
  input_text = input_text.replace(entity, "")
196
+
197
  return input_text
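
get_keywords is shown in full above, so a small self-contained example is safe to give; note that search_by_google additionally requires GOOGLE_API_KEY and SEARCH_ENGINE_ID in the environment. The keyword order below is indicative only.

doc = (
    "Deep learning models keep improving. Deep learning research "
    "relies on large datasets and modern neural networks."
)
print(get_keywords(doc, num_keywords=3))
# indicative output: ['deep', 'learning', ...] -- repeated terms score highest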
src/application/text/search_detection.py CHANGED
@@ -1,28 +1,33 @@
1
  import string
2
  import warnings
3
- warnings.simplefilter(action='ignore', category=FutureWarning)
4
 
5
- from src.application.text.preprocessing import split_into_paragraphs
6
- from src.application.text.search import generate_search_phrases, search_by_google
7
- from src.application.url_reader import URLReader
8
- from src.application.text.helper import extract_equal_text
9
- import numpy as np
10
  import nltk
 
11
  import torch
12
- from nltk.corpus import stopwords
13
- from sentence_transformers import SentenceTransformer, util
14
- import math
 
15
 
16
- from difflib import SequenceMatcher
 
 
 
 
 
 
 
 
17
 
18
  # Download necessary NLTK data files
19
- nltk.download('punkt', quiet=True)
20
- nltk.download('punkt_tab', quiet=True)
21
- nltk.download('stopwords', quiet=True)
22
 
23
  # load the model
24
- DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
25
- PARAPHASE_MODEL = SentenceTransformer('paraphrase-MiniLM-L6-v2')
26
  PARAPHASE_MODEL.to(DEVICE)
27
 
28
  BATCH_SIZE = 8
@@ -35,63 +40,94 @@ MIN_RATIO_PARAPHRASE_NUM = 0.7
35
  MAX_CHAR_SIZE = 30000
36
 
37
 
38
- def detect_text_by_relative_search(input_text, index, is_support_opposite = False):
 
 
 
 
39
  checked_urls = set()
40
  searched_phrases = generate_search_phrases(input_text[index])
41
 
42
  for candidate in searched_phrases:
43
  search_results = search_by_google(candidate)
44
- urls = [item['link'] for item in search_results.get("items", [])]
45
 
46
  for url in urls[:3]:
47
- if url in checked_urls: # visited url
48
  continue
49
  if "bbc.com" not in url:
50
  continue
51
-
52
  checked_urls.add(url)
53
  print(f"\t\tChecking URL: {url}")
54
-
55
  content = URLReader(url)
56
-
57
  if content.is_extracted is True:
58
  if content.title is None or content.text is None:
59
- print(f"\t\t\t↑↑↑ Title or text not found")
60
  continue
61
-
62
  page_text = content.title + "\n" + content.text
63
  if len(page_text) > MAX_CHAR_SIZE:
64
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
65
  continue
66
- print(f"\t\t\t↑↑↑ Title: {content.title}")
67
- paraphrase, aligned_first_sentences = check_paraphrase(input_text[index], page_text, url)
68
-
 
 
 
 
69
  if paraphrase is False:
70
- return paraphrase, url, aligned_first_sentences, content.images, index
71
-
 
 
 
 
 
 
72
  sub_paraphrase = True
73
- while sub_paraphrase == True:
74
  index += 1
75
  print(f"----search {index} < {len(input_text)}----")
76
  if index >= len(input_text):
77
  print(f"input_text_last: {input_text[-1]}")
78
  break
79
  print(f"input_text: {input_text[index]}")
80
- sub_paraphrase, sub_sentences = check_paraphrase(input_text[index], page_text, url)
 
 
 
 
81
  print(f"sub_paraphrase: {sub_paraphrase}")
82
  print(f"sub_sentences: {sub_sentences}")
83
- if sub_paraphrase == True:
84
- aligned_first_sentences["input_sentence"] += "<br>" + sub_sentences["input_sentence"]
85
- aligned_first_sentences["matched_sentence"] += "<br>" + sub_sentences["matched_sentence"]
86
- aligned_first_sentences["similarity"] += sub_sentences["similarity"]
 
 
 
 
 
 
87
  aligned_first_sentences["similarity"] /= 2
88
-
89
  print(f"paraphrase: {paraphrase}")
90
  print(f"aligned_first_sentences: {aligned_first_sentences}")
91
- return paraphrase, url, aligned_first_sentences, content.images, index
92
-
 
 
 
 
 
 
93
  return False, None, [], [], index
94
 
 
95
  def find_text_source(text, text_index, sentences_df):
96
  sentence = {
97
  "input_sentence": text[text_index],
@@ -101,67 +137,94 @@ def find_text_source(text, text_index, sentences_df):
101
  "paraphrase": None,
102
  "url": "",
103
  "group": None,
104
- }
105
  checked_urls = set()
106
  searched_phrases = generate_search_phrases(text[text_index])
107
 
108
  for candidate in searched_phrases:
109
  search_results = search_by_google(candidate)
110
- urls = [item['link'] for item in search_results.get("items", [])]
111
 
112
  for url in urls[:3]:
113
- if url in checked_urls: # visited url
114
  continue
115
  if "bbc.com" not in url:
116
  continue
117
-
118
  checked_urls.add(url)
119
  print(f"\t\tChecking URL: {url}")
120
-
121
  content = URLReader(url)
122
-
123
  if content.is_extracted is True:
124
  if content.title is None or content.text is None:
125
- print(f"\t\t\t↑↑↑ Title or text not found")
126
  continue
127
-
128
  page_text = content.title + "\n" + content.text
129
  if len(page_text) > MAX_CHAR_SIZE:
130
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
131
  continue
132
- print(f"\t\t\t↑↑↑ Title: {content.title}")
133
- paraphrase, aligned_sentence = check_paraphrase(text, page_text, url)
134
-
 
 
 
 
135
  # add one more key "group" into aligned_sentence
136
- sentences_df.loc[index, "input_sentence"] = aligned_sentence["input_sentence"]
137
- sentences_df.loc[index, "matched_sentence"] = aligned_sentence["matched_sentence"]
138
- sentences_df.loc[index, "label"] = aligned_sentence["label"]
139
- sentences_df.loc[index, "similarity"] = aligned_sentence["similarity"]
140
- sentences_df.loc[index, "url"] = aligned_sentence["url"]
141
-
 
 
 
 
 
 
 
 
142
  if aligned_sentence["paraphrase"] is False:
143
  return paraphrase, sentences_df
144
-
145
- for index, _ in enumerate(sentences_df):
146
- if sentences_df[index]["url"] is not None:
147
  continue
148
-
149
  # find content in new url
150
- _, aligned_sentence = check_paraphrase(text[index], page_text, url)
151
-
 
 
 
 
152
  if aligned_sentence["url"] is not None:
153
  continue
154
-
155
- sentences_df.loc[index, "input_sentence"] = aligned_sentence["input_sentence"]
156
- sentences_df.loc[index, "matched_sentence"] = aligned_sentence["matched_sentence"]
157
- sentences_df.loc[index, "label"] = aligned_sentence["label"]
158
- sentences_df.loc[index, "similarity"] = aligned_sentence["similarity"]
159
- sentences_df.loc[index, "url"] = aligned_sentence["url"]
160
-
161
  return sentences_df, content.images
162
-
163
  return sentence, []
164
 
 
165
  def longest_common_subsequence(arr1, arr2):
166
  """
167
  Finds the length of the longest common subsequence (contiguous) between
@@ -172,7 +235,7 @@ def longest_common_subsequence(arr1, arr2):
172
  arr2: The second array.
173
 
174
  Returns:
175
- The length of the longest common subsequence.
176
  Returns 0 if either input is invalid.
177
  """
178
 
@@ -182,7 +245,7 @@ def longest_common_subsequence(arr1, arr2):
182
  n = len(arr1)
183
  m = len(arr2)
184
 
185
- if n == 0 or m == 0: #handle empty list
186
  return 0
187
 
188
  # Create table dp with size (n+1) x (m+1)
@@ -200,10 +263,15 @@ def longest_common_subsequence(arr1, arr2):
200
  return max_length
201
 
202
 
203
- def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
204
- min_phrase_sentence_len, verbose=False):
 
 
 
 
 
205
  """
206
- Checks if two sentences are similar based on exact match or
207
  longest common subsequence.
208
 
209
  Args:
@@ -218,7 +286,10 @@ def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
218
  Returns False if input is not valid.
219
  """
220
 
221
- if not isinstance(input_sentence, str) or not isinstance(source_sentence, str):
 
 
 
222
  return False
223
 
224
  input_sentence = input_sentence.strip()
@@ -230,7 +301,10 @@ def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
230
  input_words = input_sentence.split() # split without arguments
231
  source_words = source_sentence.split() # split without arguments
232
 
233
- if input_sentence == source_sentence and len(input_words) >= min_same_sentence_len:
 
 
 
234
  if verbose:
235
  print("Exact match found.")
236
  return True
@@ -251,29 +325,24 @@ def check_paraphrase(input_text, page_text, url):
251
  Args:
252
  input_text: The text to check for paraphrase.
253
  page_text: The text of the web page to compare with.
254
- verbose: If True, print debug information.
255
 
256
  Returns:
257
  A tuple containing:
258
- - is_paraphrase: True if the input text is considered a paraphrase, False otherwise.
259
- - paraphrase_results: A list of dictionaries, each containing:
260
- - input_sentence: The sentence from the input text.
261
- - matched_sentence: The corresponding sentence from the web page (if found).
262
- - similarity: The cosine similarity score between the sentences.
263
- - is_paraphrase_sentence: True if the individual sentence pair meets the paraphrase criteria, False otherwise.
264
  """
265
  is_paraphrase_text = False
266
-
267
  if not isinstance(input_text, str) or not isinstance(page_text, str):
268
  return False, []
269
 
270
  # Extract sentences from input text and web page
271
  # input_sentences = split_into_paragraphs(input_text)
272
  input_sentences = [input_text]
273
-
274
  if not page_text:
275
  return is_paraphrase_text, []
276
-
277
  page_sentences = split_into_paragraphs(page_text)
278
  if not input_sentences or not page_sentences:
279
  return is_paraphrase_text, []
@@ -283,10 +352,18 @@ def check_paraphrase(input_text, page_text, url):
283
  if ", external" in sentence:
284
  additional_sentences.append(sentence.replace(", external", ""))
285
  page_sentences.extend(additional_sentences)
286
-
287
  # Encode sentences into embeddings
288
- embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
289
- embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)
 
 
 
 
 
 
 
 
290
 
291
  # Compute cosine similarity matrix
292
  similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
@@ -298,7 +375,7 @@ def check_paraphrase(input_text, page_text, url):
298
  for i, sentence1 in enumerate(input_sentences):
299
  max_sim_index = np.argmax(similarity_matrix[i])
300
  max_similarity = similarity_matrix[i][max_sim_index]
301
-
302
  best_matched_sentence = page_sentences[max_sim_index]
303
  is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
304
 
@@ -321,29 +398,40 @@ def check_paraphrase(input_text, page_text, url):
321
  "url": url,
322
  }
323
 
324
- # Check for individual sentence paraphrase if overall paraphrase not yet found
 
325
  if not is_paraphrase_text and check_sentence(
326
- sentence1, page_sentences[max_sim_index], MIN_SAME_SENTENCE_LEN, MIN_PHRASE_SENTENCE_LEN
 
 
 
327
  ):
328
  is_paraphrase_text = True
329
 
330
- #alignment.append(item)
331
  paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
332
 
333
  # Check if enough sentences are paraphrases
334
-
335
- is_paraphrase_text = paraphrased_sentence_count > 0 #min_matching_sentences
336
-
 
 
337
  # Method 2: Check if overlapped words between sentences are more than 50%
338
- equal_idx_1, _ = extract_equal_text(input_sentences[0], best_matched_sentence)
 
 
 
339
  matched_count = 0
340
  for index in equal_idx_1:
341
  matched_count += index["end"] - index["start"]
342
- sent = input_sentences[0].translate(str.maketrans('', '', string.punctuation))
 
 
343
  num_words = len(sent.split())
344
  if matched_count > num_words / 2:
345
  is_paraphrase_text = True
346
-
347
  return is_paraphrase_text, alignment
348
 
349
 
@@ -359,10 +447,16 @@ def similarity_ratio(a, b):
359
  A float representing the similarity ratio between 0.0 and 1.0.
360
  Returns 0.0 if either input is None or not a string.
361
  """
362
- if not isinstance(a, str) or not isinstance(b, str) or a is None or b is None:
 
 
 
 
 
363
  return 0.0 # Handle cases where inputs are not strings or None
364
  return SequenceMatcher(None, a, b).ratio()
365
 
 
366
  def check_human(alligned_sentences):
367
  """
368
  Checks if a sufficient number of input sentences are found within
@@ -379,5 +473,5 @@ def check_human(alligned_sentences):
379
  return False
380
 
381
 
382
- if __name__ == '__main__':
383
- pass
 
1
  import string
2
  import warnings
3
+ from difflib import SequenceMatcher
4
 
 
 
 
 
 
5
  import nltk
6
+ import numpy as np
7
  import torch
8
+ from sentence_transformers import (
9
+ SentenceTransformer,
10
+ util,
11
+ )
12
 
13
+ from src.application.text.helper import extract_equal_text
14
+ from src.application.text.preprocessing import split_into_paragraphs
15
+ from src.application.text.search import (
16
+ generate_search_phrases,
17
+ search_by_google,
18
+ )
19
+ from src.application.url_reader import URLReader
20
+
21
+ warnings.simplefilter(action="ignore", category=FutureWarning)
22
 
23
  # Download necessary NLTK data files
24
+ nltk.download("punkt", quiet=True)
25
+ nltk.download("punkt_tab", quiet=True)
26
+ nltk.download("stopwords", quiet=True)
27
 
28
  # load the model
29
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
30
+ PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
31
  PARAPHASE_MODEL.to(DEVICE)
32
 
33
  BATCH_SIZE = 8
 
40
  MAX_CHAR_SIZE = 30000
41
 
42
 
43
+ def detect_text_by_relative_search(
44
+ input_text,
45
+ index,
46
+ is_support_opposite=False,
47
+ ):
48
  checked_urls = set()
49
  searched_phrases = generate_search_phrases(input_text[index])
50
 
51
  for candidate in searched_phrases:
52
  search_results = search_by_google(candidate)
53
+ urls = [item["link"] for item in search_results.get("items", [])]
54
 
55
  for url in urls[:3]:
56
+ if url in checked_urls: # visited url
57
  continue
58
  if "bbc.com" not in url:
59
  continue
60
+
61
  checked_urls.add(url)
62
  print(f"\t\tChecking URL: {url}")
63
+
64
  content = URLReader(url)
65
+
66
  if content.is_extracted is True:
67
  if content.title is None or content.text is None:
68
+ print("\t\t\t↑↑↑ Title or text not found")
69
  continue
70
+
71
  page_text = content.title + "\n" + content.text
72
  if len(page_text) > MAX_CHAR_SIZE:
73
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
74
  continue
75
+ print(f"\t\t\t↑↑↑ Title: {content.title}")
76
+ paraphrase, aligned_first_sentences = check_paraphrase(
77
+ input_text[index],
78
+ page_text,
79
+ url,
80
+ )
81
+
82
  if paraphrase is False:
83
+ return (
84
+ paraphrase,
85
+ url,
86
+ aligned_first_sentences,
87
+ content.images,
88
+ index,
89
+ )
90
+
91
  sub_paraphrase = True
92
+ while sub_paraphrase is True:
93
  index += 1
94
  print(f"----search {index} < {len(input_text)}----")
95
  if index >= len(input_text):
96
  print(f"input_text_last: {input_text[-1]}")
97
  break
98
  print(f"input_text: {input_text[index]}")
99
+ sub_paraphrase, sub_sentences = check_paraphrase(
100
+ input_text[index],
101
+ page_text,
102
+ url,
103
+ )
104
  print(f"sub_paraphrase: {sub_paraphrase}")
105
  print(f"sub_sentences: {sub_sentences}")
106
+ if sub_paraphrase is True:
107
+ aligned_first_sentences["input_sentence"] += (
108
+ "<br>" + sub_sentences["input_sentence"]
109
+ )
110
+ aligned_first_sentences["matched_sentence"] += (
111
+ "<br>" + sub_sentences["matched_sentence"]
112
+ )
113
+ aligned_first_sentences["similarity"] += sub_sentences[
114
+ "similarity"
115
+ ]
116
  aligned_first_sentences["similarity"] /= 2
117
+
118
  print(f"paraphrase: {paraphrase}")
119
  print(f"aligned_first_sentences: {aligned_first_sentences}")
120
+ return (
121
+ paraphrase,
122
+ url,
123
+ aligned_first_sentences,
124
+ content.images,
125
+ index,
126
+ )
127
+
128
  return False, None, [], [], index
129
 
130
+
131
  def find_text_source(text, text_index, sentences_df):
132
  sentence = {
133
  "input_sentence": text[text_index],
 
137
  "paraphrase": None,
138
  "url": "",
139
  "group": None,
140
+ }
141
  checked_urls = set()
142
  searched_phrases = generate_search_phrases(text[text_index])
143
 
144
  for candidate in searched_phrases:
145
  search_results = search_by_google(candidate)
146
+ urls = [item["link"] for item in search_results.get("items", [])]
147
 
148
  for url in urls[:3]:
149
+ if url in checked_urls: # visited url
150
  continue
151
  if "bbc.com" not in url:
152
  continue
153
+
154
  checked_urls.add(url)
155
  print(f"\t\tChecking URL: {url}")
156
+
157
  content = URLReader(url)
158
+
159
  if content.is_extracted is True:
160
  if content.title is None or content.text is None:
161
+ print("\t\t\t↑↑↑ Title or text not found")
162
  continue
163
+
164
  page_text = content.title + "\n" + content.text
165
  if len(page_text) > MAX_CHAR_SIZE:
166
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
167
  continue
168
+ print(f"\t\t\t↑↑↑ Title: {content.title}")
169
+ paraphrase, aligned_sentence = check_paraphrase(
170
+ text,
171
+ page_text,
172
+ url,
173
+ )
174
+
175
  # add one more key "group" into aligned_sentence
176
+ sentences_df.loc[text_index, "input_sentence"] = (
177
+ aligned_sentence["input_sentence"]
178
+ )
179
+ sentences_df.loc[text_index, "matched_sentence"] = (
180
+ aligned_sentence["matched_sentence"]
181
+ )
182
+ sentences_df.loc[text_index, "label"] = aligned_sentence[
183
+ "label"
184
+ ]
185
+ sentences_df.loc[text_index, "similarity"] = aligned_sentence[
186
+ "similarity"
187
+ ]
188
+ sentences_df.loc[text_index, "url"] = aligned_sentence["url"]
189
+
190
  if aligned_sentence["paraphrase"] is False:
191
  return paraphrase, sentences_df
192
+
193
+ for text_index, _ in enumerate(sentences_df):
194
+ if sentences_df[text_index]["url"] is not None:
195
  continue
196
+
197
  # find content in new url
198
+ _, aligned_sentence = check_paraphrase(
199
+ text[text_index],
200
+ page_text,
201
+ url,
202
+ )
203
+
204
  if aligned_sentence["url"] is not None:
205
  continue
206
+
207
+ sentences_df.loc[text_index, "input_sentence"] = (
208
+ aligned_sentence["input_sentence"]
209
+ )
210
+ sentences_df.loc[text_index, "matched_sentence"] = (
211
+ aligned_sentence["matched_sentence"]
212
+ )
213
+ sentences_df.loc[text_index, "label"] = aligned_sentence[
214
+ "label"
215
+ ]
216
+ sentences_df.loc[text_index, "similarity"] = (
217
+ aligned_sentence["similarity"]
218
+ )
219
+ sentences_df.loc[text_index, "url"] = aligned_sentence[
220
+ "url"
221
+ ]
222
+
223
  return sentences_df, content.images
224
+
225
  return sentence, []
226
 
227
+
228
  def longest_common_subsequence(arr1, arr2):
229
  """
230
  Finds the length of the longest common subsequence (contiguous) between
 
235
  arr2: The second array.
236
 
237
  Returns:
238
+ The length of the longest common subsequence.
239
  Returns 0 if either input is invalid.
240
  """
241
 
 
245
  n = len(arr1)
246
  m = len(arr2)
247
 
248
+ if n == 0 or m == 0: # handle empty list
249
  return 0
250
 
251
  # Create table dp with size (n+1) x (m+1)
 
263
  return max_length
264
 
265
 
266
+ def check_sentence(
267
+ input_sentence,
268
+ source_sentence,
269
+ min_same_sentence_len,
270
+ min_phrase_sentence_len,
271
+ verbose=False,
272
+ ):
273
  """
274
+ Checks if two sentences are similar based on exact match or
275
  longest common subsequence.
276
 
277
  Args:
 
286
  Returns False if input is not valid.
287
  """
288
 
289
+ if not isinstance(input_sentence, str) or not isinstance(
290
+ source_sentence,
291
+ str,
292
+ ):
293
  return False
294
 
295
  input_sentence = input_sentence.strip()
 
301
  input_words = input_sentence.split() # split without arguments
302
  source_words = source_sentence.split() # split without arguments
303
 
304
+ if (
305
+ input_sentence == source_sentence
306
+ and len(input_words) >= min_same_sentence_len
307
+ ):
308
  if verbose:
309
  print("Exact match found.")
310
  return True
 
325
  Args:
326
  input_text: The text to check for paraphrase.
327
  page_text: The text of the web page to compare with.
328
+ url: The URL of the source page, recorded in the alignment result.
329
 
330
  Returns:
331
  A tuple containing:
332
+
 
 
 
 
 
333
  """
334
  is_paraphrase_text = False
335
+
336
  if not isinstance(input_text, str) or not isinstance(page_text, str):
337
  return False, []
338
 
339
  # Extract sentences from input text and web page
340
  # input_sentences = split_into_paragraphs(input_text)
341
  input_sentences = [input_text]
342
+
343
  if not page_text:
344
  return is_paraphrase_text, []
345
+
346
  page_sentences = split_into_paragraphs(page_text)
347
  if not input_sentences or not page_sentences:
348
  return is_paraphrase_text, []
 
352
  if ", external" in sentence:
353
  additional_sentences.append(sentence.replace(", external", ""))
354
  page_sentences.extend(additional_sentences)
355
+
356
  # Encode sentences into embeddings
357
+ embeddings1 = PARAPHASE_MODEL.encode(
358
+ input_sentences,
359
+ convert_to_tensor=True,
360
+ device=DEVICE,
361
+ )
362
+ embeddings2 = PARAPHASE_MODEL.encode(
363
+ page_sentences,
364
+ convert_to_tensor=True,
365
+ device=DEVICE,
366
+ )
367
 
368
  # Compute cosine similarity matrix
369
  similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
 
375
  for i, sentence1 in enumerate(input_sentences):
376
  max_sim_index = np.argmax(similarity_matrix[i])
377
  max_similarity = similarity_matrix[i][max_sim_index]
378
+
379
  best_matched_sentence = page_sentences[max_sim_index]
380
  is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
381
 
 
398
  "url": url,
399
  }
400
 
401
+ # Check for individual sentence paraphrase
402
+ # if overall paraphrase not yet found
403
  if not is_paraphrase_text and check_sentence(
404
+ sentence1,
405
+ page_sentences[max_sim_index],
406
+ MIN_SAME_SENTENCE_LEN,
407
+ MIN_PHRASE_SENTENCE_LEN,
408
  ):
409
  is_paraphrase_text = True
410
 
411
+ # alignment.append(item)
412
  paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
413
 
414
  # Check if enough sentences are paraphrases
415
+
416
+ is_paraphrase_text = (
417
+ paraphrased_sentence_count > 0
418
+ ) # min_matching_sentences
419
+
420
  # Method 2: Check if overlapped words between sentences are more than 50%
421
+ equal_idx_1, _ = extract_equal_text(
422
+ input_sentences[0],
423
+ best_matched_sentence,
424
+ )
425
  matched_count = 0
426
  for index in equal_idx_1:
427
  matched_count += index["end"] - index["start"]
428
+ sent = input_sentences[0].translate(
429
+ str.maketrans("", "", string.punctuation),
430
+ )
431
  num_words = len(sent.split())
432
  if matched_count > num_words / 2:
433
  is_paraphrase_text = True
434
+
435
  return is_paraphrase_text, alignment
436
 
437
 
 
447
  A float representing the similarity ratio between 0.0 and 1.0.
448
  Returns 0.0 if either input is None or not a string.
449
  """
450
+ if (
451
+ not isinstance(a, str)
452
+ or not isinstance(b, str)
453
+ or a is None
454
+ or b is None
455
+ ):
456
  return 0.0 # Handle cases where inputs are not strings or None
457
  return SequenceMatcher(None, a, b).ratio()
458
 
459
+
460
  def check_human(alligned_sentences):
461
  """
462
  Checks if a sufficient number of input sentences are found within
 
473
  return False
474
 
475
 
476
+ if __name__ == "__main__":
477
+ pass
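
The heart of check_paraphrase is the sentence-embedding similarity shown above; the following standalone sketch reproduces just that step (the module's PARAPHRASE_THRESHOLD is defined outside this hunk, so a stand-in value is used).

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
inputs = model.encode(["The cabinet approved the new budget."], convert_to_tensor=True)
candidates = model.encode(
    ["Ministers signed off on the updated budget.", "An unrelated sentence."],
    convert_to_tensor=True,
)
scores = util.cos_sim(inputs, candidates)  # shape (1, 2)
THRESHOLD = 0.8  # stand-in for PARAPHRASE_THRESHOLD
print(float(scores.max()), float(scores.max()) > THRESHOLD)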
src/application/url_reader.py CHANGED
@@ -1,31 +1,40 @@
1
  import string
2
- from bs4 import BeautifulSoup
3
- from newspaper import article, ArticleException, ArticleBinaryDataException
4
  import requests
 
 
 
 
 
 
5
 
6
  # TODO: move this to a config file
7
- MAX_URL_SIZE = 2000000 # ~2MB
8
- class URLReader():
9
- def __init__(self, url: string, newspaper: bool=True):
 
 
10
  self.url = url
11
  self.text = None # string
12
  self.title = None # string
13
  self.images = None # list of Image objects
14
  self.top_image = None # Image object
15
  self.is_extracted = False
16
-
17
  url_size = self.get_size()
18
- if url_size == None or url_size > MAX_URL_SIZE:
19
  return
20
- else:
21
  self.is_extracted = True
22
-
23
- self.newspaper = newspaper # True if using newspaper4k, False if using BS
 
 
24
  if self.newspaper is True:
25
  self.extract_content_newspaper()
26
  else:
27
  self.extract_content_bs()
28
-
29
  def extract_content_newspaper(self):
30
  """
31
  Use newspaper4k to extracts content from a URL
@@ -36,20 +45,20 @@ class URLReader():
36
  Returns:
37
  The extracted content (title, text, images)
38
  """
39
-
40
  try:
41
  response = requests.get(self.url)
42
- response.raise_for_status() # Raise exception for unsuccessful requests
43
  except requests.exceptions.RequestException as e:
44
  print(f"Error fetching URL: {e}")
45
  return None
46
-
47
  try:
48
  news = article(url=self.url, fetch_images=True)
49
  except (ArticleException, ArticleBinaryDataException) as e:
50
  print(f"\t\t↑↑↑ Error downloading article: {e}")
51
  return None
52
-
53
  self.title = news.title
54
  self.text = news.text
55
  self.images = list(set(news.images)) # Remove duplicates
@@ -61,30 +70,30 @@ class URLReader():
61
  """
62
  response = requests.get(self.url)
63
  response.raise_for_status()
64
-
65
  response.encoding = response.apparent_encoding
66
-
67
  try:
68
  soup = BeautifulSoup(response.content, "html.parser")
69
- except:
70
- print(f"Error parsing HTML content from {self.url}")
71
  return None
72
-
73
  self.title = soup.title.string.strip() if soup.title else None
74
-
75
- image_urls = [img['src'] for img in soup.find_all('img')]
76
  self.images = image_urls
77
  self.top_image = self.images[0]
78
-
79
  # Exclude text within specific elements
80
  for element in soup(["img", "figcaption", "table", "script", "style"]):
81
  element.extract()
82
- #text = soup.get_text(separator="\n")
83
- paragraphs = soup.find_all('p')
84
- text = ' '.join([p.get_text() for p in paragraphs])
85
 
86
  self.text = text
87
-
88
  def get_size(self):
89
  """
90
  Retrieves the size of a URL's content using a HEAD request.
@@ -93,27 +102,32 @@ class URLReader():
93
  url: The URL to check.
94
 
95
  Returns:
96
- The size of the content in bytes, or None if the size cannot be determined
 
97
  (e.g., due to network errors or missing Content-Length header).
98
  """
99
  try:
100
- response = requests.head(self.url, allow_redirects=True, timeout=5) # Add timeout
101
- response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
 
 
 
 
102
 
103
- content_length = response.headers.get('Content-Length')
104
  if content_length is not None:
105
  return int(content_length)
106
  else:
107
- print(f"\t\t↑↑↑ Content-Length header not found")
108
  return None
109
 
110
  except requests.exceptions.RequestException as e:
111
  print(f"\t\t↑↑↑ Error getting URL size: {e}")
112
  return None
113
-
114
 
115
- if __name__ == '__main__':
 
116
  url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
117
  reader = URLReader(url)
118
  print(f"Title: {reader.title}")
119
- print(f"Text: {reader.text}")
 
1
  import string
2
+
 
3
  import requests
4
+ from bs4 import BeautifulSoup
5
+ from newspaper import (
6
+ ArticleBinaryDataException,
7
+ ArticleException,
8
+ article,
9
+ )
10
 
11
  # TODO: move this to a config file
12
+ MAX_URL_SIZE = 2000000 # ~2MB
13
+
14
+
15
+ class URLReader:
16
+ def __init__(self, url: string, newspaper: bool = True):
17
  self.url = url
18
  self.text = None # string
19
  self.title = None # string
20
  self.images = None # list of Image objects
21
  self.top_image = None # Image object
22
  self.is_extracted = False
23
+
24
  url_size = self.get_size()
25
+ if url_size is None or url_size > MAX_URL_SIZE:
26
  return
27
+ else:
28
  self.is_extracted = True
29
+
30
+ self.newspaper = (
31
+ newspaper # True if using newspaper4k, False if using BS
32
+ )
33
  if self.newspaper is True:
34
  self.extract_content_newspaper()
35
  else:
36
  self.extract_content_bs()
37
+
38
  def extract_content_newspaper(self):
39
  """
40
  Use newspaper4k to extracts content from a URL
 
45
  Returns:
46
  The extracted content (title, text, images)
47
  """
48
+
49
  try:
50
  response = requests.get(self.url)
51
+ response.raise_for_status()
52
  except requests.exceptions.RequestException as e:
53
  print(f"Error fetching URL: {e}")
54
  return None
55
+
56
  try:
57
  news = article(url=self.url, fetch_images=True)
58
  except (ArticleException, ArticleBinaryDataException) as e:
59
  print(f"\t\t↑↑↑ Error downloading article: {e}")
60
  return None
61
+
62
  self.title = news.title
63
  self.text = news.text
64
  self.images = list(set(news.images)) # Remove duplicates
 
70
  """
71
  response = requests.get(self.url)
72
  response.raise_for_status()
73
+
74
  response.encoding = response.apparent_encoding
75
+
76
  try:
77
  soup = BeautifulSoup(response.content, "html.parser")
78
+ except Exception as e:
79
+ print(f"Error parsing HTML content from {self.url}: {e}")
80
  return None
81
+
82
  self.title = soup.title.string.strip() if soup.title else None
83
+
84
+ image_urls = [img["src"] for img in soup.find_all("img")]
85
  self.images = image_urls
86
  self.top_image = self.images[0]
87
+
88
  # Exclude text within specific elements
89
  for element in soup(["img", "figcaption", "table", "script", "style"]):
90
  element.extract()
91
+ # text = soup.get_text(separator="\n")
92
+ paragraphs = soup.find_all("p")
93
+ text = " ".join([p.get_text() for p in paragraphs])
94
 
95
  self.text = text
96
+
97
  def get_size(self):
98
  """
99
  Retrieves the size of a URL's content using a HEAD request.
 
102
  url: The URL to check.
103
 
104
  Returns:
105
+ The size of the content in bytes,
106
+ or None if the size cannot be determined
107
  (e.g., due to network errors or missing Content-Length header).
108
  """
109
  try:
110
+ response = requests.head(
111
+ self.url,
112
+ allow_redirects=True,
113
+ timeout=5,
114
+ ) # Add timeout
115
+ response.raise_for_status() # Raise HTTPError for bad responses
116
 
117
+ content_length = response.headers.get("Content-Length")
118
  if content_length is not None:
119
  return int(content_length)
120
  else:
121
+ print("\t\t↑↑↑ Content-Length header not found")
122
  return None
123
 
124
  except requests.exceptions.RequestException as e:
125
  print(f"\t\t↑↑↑ Error getting URL size: {e}")
126
  return None
 
127
 
128
+
129
+ if __name__ == "__main__":
130
  url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
131
  reader = URLReader(url)
132
  print(f"Title: {reader.title}")
133
+ print(f"Text: {reader.text}")
test.py CHANGED
@@ -1,5 +1,6 @@
1
  import re
2
 
 
3
  def find_entity_spans(entity, text):
4
  """
5
  Finds the start and end indices of whole word entities in text.
@@ -13,10 +14,14 @@ def find_entity_spans(entity, text):
13
  of a found entity. Returns an empty list if no entities are found.
14
  """
15
  spans = []
16
- for m in re.finditer(r"\b" + re.escape(entity) + r"\b", text): # The crucial change
 
 
 
17
  spans.append((m.start(), m.end()))
18
  return spans
19
 
 
20
  # Example usage:
21
  temp_text = "win winger winning"
22
  entity = {"key": "win"} # Example dictionary (adjust as needed)
@@ -27,24 +32,24 @@ print(spans) # Output: [(0, 3)] (Only "win" at the beginning)
27
  temp_text = "The quick brown fox jumps over the lazy dog."
28
  entity = {"key": "fox"}
29
  spans = find_entity_spans(entity["key"], temp_text)
30
- print(spans) # Output: [(16, 19)]
31
 
32
  temp_text = "foxes fox foxing"
33
  entity = {"key": "fox"}
34
  spans = find_entity_spans(entity["key"], temp_text)
35
- print(spans) # Output: [(0, 3), (6, 9)]
36
 
37
  temp_text = "winger win winning"
38
  entity = {"key": "win"}
39
  spans = find_entity_spans(entity["key"], temp_text)
40
- print(spans) # Output: [(8, 11)]
41
 
42
  temp_text = "winger win winning"
43
  entity = {"key": "winger"}
44
  spans = find_entity_spans(entity["key"], temp_text)
45
- print(spans) # Output: [(0, 6)]
46
 
47
  temp_text = "winger win winning"
48
  entity = {"key": "winning"}
49
  spans = find_entity_spans(entity["key"], temp_text)
50
- print(spans) # Output: [(12, 19)]
 
1
  import re
2
 
3
+
4
  def find_entity_spans(entity, text):
5
  """
6
  Finds the start and end indices of whole word entities in text.
 
14
  of a found entity. Returns an empty list if no entities are found.
15
  """
16
  spans = []
17
+ for m in re.finditer(
18
+ r"\b" + re.escape(entity) + r"\b",
19
+ text,
20
+ ): # The crucial change
21
  spans.append((m.start(), m.end()))
22
  return spans
23
 
24
+
25
  # Example usage:
26
  temp_text = "win winger winning"
27
  entity = {"key": "win"} # Example dictionary (adjust as needed)
 
32
  temp_text = "The quick brown fox jumps over the lazy dog."
33
  entity = {"key": "fox"}
34
  spans = find_entity_spans(entity["key"], temp_text)
35
+ print(spans) # Output: [(16, 19)]
36
 
37
  temp_text = "foxes fox foxing"
38
  entity = {"key": "fox"}
39
  spans = find_entity_spans(entity["key"], temp_text)
40
+ print(spans) # Output: [(0, 3), (6, 9)]
41
 
42
  temp_text = "winger win winning"
43
  entity = {"key": "win"}
44
  spans = find_entity_spans(entity["key"], temp_text)
45
+ print(spans) # Output: [(8, 11)]
46
 
47
  temp_text = "winger win winning"
48
  entity = {"key": "winger"}
49
  spans = find_entity_spans(entity["key"], temp_text)
50
+ print(spans) # Output: [(0, 6)]
51
 
52
  temp_text = "winger win winning"
53
  entity = {"key": "winning"}
54
  spans = find_entity_spans(entity["key"], temp_text)
55
+ print(spans) # Output: [(12, 19)]
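
The printed expectations in test.py could be tightened into assertions; below is a small pytest-style sketch using only the cases whose spans are unambiguous.

def test_whole_word_at_start():
    assert find_entity_spans("win", "win winger winning") == [(0, 3)]

def test_whole_word_entity_winger():
    assert find_entity_spans("winger", "winger win winning") == [(0, 6)]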