Spaces: Running

Commit 26e3944 · Parent(s): 56cf7e3

add 2 more screen for ordinary and governor users

Files changed:
- application_2.py +49 -12
- examples/example_text_LLM_modification.txt +3 -1
- src/application/content_detection.py +271 -93
- src/application/text/entity.py +21 -18
- src/application/text/helper.py +25 -0
- src/application/text/preprocessing.py +1 -1
- src/application/text/search_detection.py +23 -11
- test.py +24 -43
application_2.py CHANGED

@@ -8,10 +8,6 @@ from src.application.content_detection import NewsVerification
 from src.application.url_reader import URLReader
 from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
 
-
-GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
-SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
-
 AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
 AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
 
@@ -52,7 +48,7 @@ def generate_analysis_report(news_title: str, news_content: str, news_image: Image)
 # Define the GUI
 with gr.Blocks() as demo:
     gr.Markdown("# NEWS VERIFICATION")
-
+
     with gr.Row():
         # SETTINGS
        with gr.Column(scale=1):
@@ -88,15 +84,56 @@ with gr.Blocks() as demo:
             news_content = gr.Textbox(label="Content", value="", lines=13)
 
     # NEWS ANALYSIS REPORT
-
-    - Green texts are the matched words in the input and source news.<br>
-    - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
-    """
+    ordinary_user_explanation = """
+    FOR ORDINARY USER<br>
+    - Green texts are the matched words in the input and source news.<br>
+    - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+    """
+    fact_checker_explanation = """
+    FOR FACT CHECKER<br>
+    - Green texts are the matched words in the input and source news.<br>
+    - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+    """
+    governor_explanation = """
+    FOR GOVERNOR<br>
+    - Green texts are the matched words in the input and source news.<br>
+    - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+    """
+    table = """
+    <h5>Comparison between input news and source news</h5>
+    <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
+    <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
+    <thead>
+    <tr>
+        <th>Input news</th>
+        <th>Source (URL provided in Originality column correspondingly)</th>
+        <th>Forensic</th>
+        <th>Originality</th>
+    </tr>
+    </thead>
+    <tbody>
+    <tr>
+        <th>Input 1</th>
+        <th>Source 1 (URL provided in Originality column correspondingly)</th>
+        <th>Forensic 1</th>
+        <th>Originality 1</th>
+    </tr>
+    </tbody>
+    </table>
+
+    <style>"""
         with gr.Column(scale=2):
-            with gr.Accordion("
-            gr.
-
-
+            with gr.Accordion("NEWS ANALYSIS"):
+                verification_button = gr.Button("Verify news")
+                with gr.Tab("Ordinary User"):
+                    gr.HTML(ordinary_user_explanation)
+                    ordinary_user_result = gr.HTML(table)
+                with gr.Tab("Fact Checker"):
+                    gr.HTML(fact_checker_explanation)
+                    fact_checker_result = gr.HTML("<br>"*40)
+                with gr.Tab("Governor"):
+                    gr.HTML(governor_explanation)
+                    governor_result = gr.HTML(table)
 
     # Connect events
     load_button.click(
@@ -113,9 +150,9 @@ with gr.Blocks() as demo:
     generate_image_button.click(generate_fake_image,
                                 inputs=[image_generation_model, news_title],
                                 outputs=[news_image])
-
+    verification_button.click(generate_analysis_report,
                                 inputs=[news_title, news_content, news_image],
-                                outputs=[
+                                outputs=[ordinary_user_result, fact_checker_result, governor_result])
 
     # change Image
     #url_input.change(load_image, inputs=url_input, outputs=image_view)
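The core of this UI change is that a single click handler now fans out to three audience-specific panes. Below is a minimal, self-contained sketch of that wiring pattern; the stub callback stands in for generate_analysis_report (which in the real app also receives news_image), and the layout is trimmed to the pieces relevant here.

import gradio as gr

# Stand-in for generate_analysis_report: one HTML report per audience tab.
def analyze(title: str, content: str) -> tuple[str, str, str]:
    report = f"<p>Report for: {title or 'untitled'}</p>"
    return report, report, report

with gr.Blocks() as demo:
    news_title = gr.Textbox(label="Title")
    news_content = gr.Textbox(label="Content")
    with gr.Accordion("NEWS ANALYSIS"):
        verification_button = gr.Button("Verify news")
        with gr.Tab("Ordinary User"):
            ordinary_user_result = gr.HTML()
        with gr.Tab("Fact Checker"):
            fact_checker_result = gr.HTML()
        with gr.Tab("Governor"):
            governor_result = gr.HTML()
    # A single event can populate all three tabs: the callback returns a
    # 3-tuple that Gradio maps onto the outputs list in order.
    verification_button.click(
        analyze,
        inputs=[news_title, news_content],
        outputs=[ordinary_user_result, fact_checker_result, governor_result],
    )

if __name__ == "__main__":
    demo.launch()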
examples/example_text_LLM_modification.txt CHANGED

@@ -1 +1,3 @@
-Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
+Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
+He made a substitute appearance and waved farewell to fans in Newcastle's recent loss against Southampton.
+Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2020-21, and scored against Paris St-Germain in the Champions League.
src/application/content_detection.py CHANGED

@@ -1,10 +1,9 @@
 from difflib import SequenceMatcher
-import difflib
-import string
 from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
 from src.application.text.entity import apply_highlight, highlight_entities
+from src.application.text.helper import extract_equal_text
 from src.application.text.model_detection import detect_text_by_ai_model
-from src.application.text.preprocessing import
+from src.application.text.preprocessing import split_into_paragraphs
 from src.application.text.search_detection import check_human, detect_text_by_relative_search
 
 
@@ -27,7 +26,10 @@ class NewsVerification():
         self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
         self.aligned_sentences:list[dict] = []
         self.is_paraphrased:list[bool] = []
-        self.analyzed_table:list = []
+
+        self.ordinary_user_table:list = []
+        self.fact_checker_table:list = []
+        self.governor_table:list = []
 
     def load_news(self, news_title, news_content, news_image):
         self.news_text = news_title + "\n\n" + news_content
@@ -50,7 +52,7 @@ class NewsVerification():
         print("CHECK TEXT:")
         print("\tFrom search engine:")
         # Classify by search engine
-        input_sentences =
+        input_sentences = split_into_paragraphs(self.news_text)
         current_index = 0
         previous_paraphrase = None
         ai_sentence = {
@@ -87,7 +89,7 @@ class NewsVerification():
                 else:
                     if previous_paraphrase is False or previous_paraphrase is None:
                         # add ai_sentences to align_sentences
-                        if ai_sentence["input_sentence"] != "":
+                        if ai_sentence["input_sentence"] != "" or current_index >= len(input_sentences):
                             text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"])
                             ai_sentence["label"] = text_prediction_label
                             ai_sentence["similarity"] = text_prediction_score
@@ -102,7 +104,7 @@ class NewsVerification():
                             "paraphrase": False,
                             "url": "",
                         }
-
+
                     # add searched_sentences to align_sentences
                     if searched_sentences["input_sentence"] != "":
                         self.found_img_url.extend(img_urls)
@@ -181,63 +183,11 @@ class NewsVerification():
         self.detect_image_origin()
 
     def analyze_details(self):
-        for aligned_sentence in self.aligned_sentences:
-
-            if "input_sentence" not in aligned_sentence:
-                continue
-
-            # Get index of equal phrases in input and source sentences
-            equal_idx_1, equal_idx_2 = self.extract_equal_text(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
-            )
-
-            # Get entity-words (in pair) with colors
-            entities_with_colors = highlight_entities(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
-            )
-
-            self.analyzed_table.append(
-                [
-                    aligned_sentence["input_sentence"],
-                    aligned_sentence["matched_sentence"],
-                    equal_idx_1,
-                    equal_idx_2,
-                    entities_with_colors,
-                ]
-            )
-
-        if len(self.analyzed_table) != 0:
-            html_table = self.create_table()
-        else:
-            html_table = ""
-        return html_table
-
-    def extract_equal_text(self, text1, text2):
-        def cleanup(text):
-            text = text.lower()
-            text = text.translate(str.maketrans('', '', string.punctuation))
-            return text
-
-        splited_text1 = cleanup(text1).split()
-        splited_text2 = cleanup(text2).split()
-
-        s = SequenceMatcher(None, splited_text1, splited_text2)
-
-        equal_idx_1 = []
-        equal_idx_2 = []
-        text1 = text1.split()
-        text2 = text2.split()
-        for tag, i1, i2, j1, j2 in s.get_opcodes():
-            if tag == 'equal':
-                equal_idx_1.append({"start": i1, "end": i2})
-                equal_idx_2.append({"start": j1, "end": j2})
-                # subtext_1 = " ".join(text1[i1:i2])
-                # subtext_2 = " ".join(text2[j1:j2])
-                # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
-        return equal_idx_1, equal_idx_2
+        ordinary_user_table = self.create_ordinary_user_table()
+        fact_checker_table = self.create_fact_checker_table()
+        governor_table = self.create_governor_table()
 
+        return ordinary_user_table, fact_checker_table, governor_table
 
     def get_text_urls(self):
         return set(self.text_referent_url)
@@ -264,7 +214,7 @@ class NewsVerification():
         if not sentence_1 or not sentence_2: # Handle empty strings
             return []
 
-        s =
+        s = SequenceMatcher(None, sentence_1, sentence_2)
         common_phrases = []
 
         for block in s.get_matching_blocks():
@@ -287,16 +237,40 @@ class NewsVerification():
             position += len(sentence_1)
         return common_phrases, position
 
-    def create_table(self):
-        #table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table])
-        # loop of self.analyzed_table with index:
+    def create_fact_checker_table(self):
         rows = []
         max_length = 30 # TODO: put this in configuration
-        rows.append(self.
+        rows.append(self.format_image_fact_checker_row(max_length))
 
-        for
-
+        for aligned_sentence in self.aligned_sentences:
+            if "input_sentence" not in aligned_sentence:
+                continue
+
+            # Get index of equal phrases in input and source sentences
+            equal_idx_1, equal_idx_2 = extract_equal_text(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+
+            # Get entity-words (in pair) with colors
+            entities_with_colors = highlight_entities(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+
+            self.fact_checker_table.append(
+                [
+                    aligned_sentence,
+                    equal_idx_1,
+                    equal_idx_2,
+                    entities_with_colors,
+                ]
+            )
+
+        for row in self.fact_checker_table:
+            formatted_row = self.format_text_fact_checker_row(row, max_length)
             rows.append(formatted_row)
+
         table = "\n".join(rows)
         return f"""
         <h5>Comparison between input news and source news</h5>
@@ -316,29 +290,29 @@ class NewsVerification():
 
        <style>
        """
-
-    def format_row(self, row, max_length=30):
-        if row[
+
+    def format_text_fact_checker_row(self, row, max_length=30):
+        if row[0]["input_sentence"] == "":
+            return ""
+        if row[0]["matched_sentence"] != "": # source is not empty
             # highlight entities
-            input_sentence, highlight_idx_input = apply_highlight(row[0], row[
-            source_sentence, highlight_idx_source = apply_highlight(row[
-            print(f"highlighted_input: {input_sentence}")
+            input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input")
+            source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source")
 
             # Color overlapping words
-            input_sentence = self.color_text(input_sentence, row[
-            source_sentence = self.color_text(source_sentence, row[
-            print(f"input_sentence: {input_sentence}")
+            input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
+            source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source) # text, index of highlight words
 
             input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
             source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
         else:
-            input_sentence = row[0]
-            source_sentence = row[
+            input_sentence = row[0]["input_sentence"]
+            source_sentence = row[0]["matched_sentence"]
 
-        label =
-        score =
+        label = row[0]["label"]
+        score = row[0]["similarity"]
 
-        url =
+        url = row[0]["url"]
         short_url = self.shorten_url(url, max_length)
         source_text_url = f"""<a href="{url}">{short_url}</a>"""
 
@@ -351,8 +325,7 @@ class NewsVerification():
             </tr>
         """
 
-    def
-        # input_image = f"""<img src="example_image_input.jpg" width="200" height="150">"""
+    def format_image_fact_checker_row(self, max_length=30):
 
         if self.image_referent_url is not None or self.image_referent_url != "":
             source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
@@ -363,7 +336,216 @@ class NewsVerification():
             source_image_url = ""
 
         return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
-
+
+
+    def create_ordinary_user_table(self):
+        rows = []
+        max_length = 30 # TODO: put this in configuration
+        rows.append(self.format_image_ordinary_user_row(max_length))
+        rows.append(self.format_text_ordinary_user_row(max_length))
+        table = "\n".join(rows)
+
+        return f"""
+        <h5>Comparison between input news and source news</h5>
+        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
+        <thead>
+        <tr>
+            <th>Input news</th>
+            <th>Forensic</th>
+            <th>Originality</th>
+        </tr>
+        </thead>
+        <tbody>
+        {table}
+        </tbody>
+        </table>
+
+        <style>
+        """
+
+    def format_text_ordinary_user_row(self, max_length=30):
+        input_sentences = ""
+        source_text_urls = ""
+        label = ""
+        scores = 0
+        sentence_count = 0
+        for index, row in enumerate(self.aligned_sentences):
+            if row["input_sentence"] == "":
+                continue
+            input_sentences += row["input_sentence"]
+            label = self.aligned_sentences[index]["label"]
+            if label == "HUMAN":
+                score = self.aligned_sentences[index]["similarity"]
+            if label == "MACHINE":
+                score = 1 - self.aligned_sentences[index]["similarity"]
+            scores += score
+
+            url = self.aligned_sentences[index]["url"]
+            short_url = self.shorten_url(url, max_length)
+            source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
+            sentence_count += 1
+
+        if scores == 0:
+            label = "UNKNOWN"
+        else:
+            scores /= sentence_count
+            if scores > 0.5:
+                label = "HUMAN"
+            else:
+                label = "MACHINE"
+                scores = 1 - scores
+
+        return f"""
+        <tr>
+            <td>{input_sentences}</td>
+            <td>{label}<br>({scores*100:.2f}%)</td>
+            <td>{source_text_urls}</td>
+        </tr>
+        """
+
+    def format_image_ordinary_user_row(self, max_length=30):
+
+        if self.image_referent_url is not None or self.image_referent_url != "":
+            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
+            short_url = self.shorten_url(self.image_referent_url, max_length)
+            source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
+        else:
+            source_image = "Image not found"
+            source_image_url = ""
+
+        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
+
+
+    def create_governor_table(self):
+        rows = []
+        max_length = 30 # TODO: put this in configuration
+        rows.append(self.format_image_governor_row(max_length))
+
+        for aligned_sentence in self.aligned_sentences:
+            if "input_sentence" not in aligned_sentence:
+                continue
+
+            # Get index of equal phrases in input and source sentences
+            equal_idx_1, equal_idx_2 = extract_equal_text(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+
+            # Get entity-words (in pair) with colors
+            entities_with_colors = highlight_entities(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+
+            self.governor_table.append(
+                [
+                    aligned_sentence,
+                    equal_idx_1,
+                    equal_idx_2,
+                    entities_with_colors,
+                ]
+            )
+
+        formatted_row = self.format_text_governor_row(max_length)
+        rows.append(formatted_row)
+
+        table = "\n".join(rows)
+        return f"""
+        <h5>Comparison between input news and source news</h5>
+        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
+        <col style="width: 150px;"> <col style="width: 150px;"> <col style="width: 50px;"> <col style="width: 75px;">
+        <thead>
+        <tr>
+            <th>Input news</th>
+            <th>Source (URL provided in Originality column correspondingly)</th>
+            <th>Forensic</th>
+            <th>Originality</th>
+        </tr>
+        </thead>
+        <tbody>
+        {table}
+        </tbody>
+        </table>
+
+        <style>
+        """
+
+    def format_text_governor_row(self, max_length=30):
+        input_sentences = ""
+        source_sentences = ""
+        source_text_urls = ""
+        label = ""
+        scores = 0
+        sentence_count = 0
+        entity_count = 0
+        for row in self.governor_table:
+            print(f"governor_row: {row}")
+            if row[0]["input_sentence"] == "":
+                continue
+
+            if row[0]["matched_sentence"] != "": # source is not empty
+                # highlight entities
+                input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input", entity_count)
+                source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source", entity_count)
+                entity_count += len(row[3])
+
+                # Color overlapping words
+                input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
+                source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source) # text, index of highlight words
+
+                input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
+                source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
+
+            else:
+                input_sentence = row[0]["input_sentence"]
+                source_sentence = row[0]["matched_sentence"]
+
+            input_sentences += input_sentence
+            source_sentences += source_sentence
+            score = row[0]["similarity"]
+            label = row[0]["label"]
+            if label == "HUMAN":
+                score = row[0]["similarity"]
+            if label == "MACHINE":
+                score = 1 - row[0]["similarity"]
+            scores += score
+
+            url = row[0]["url"]
+            short_url = self.shorten_url(url, max_length)
+            source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
+            sentence_count += 1
+
+        if scores == 0:
+            label = "UNKNOWN"
+        else:
+            scores /= sentence_count
+            if scores > 0.5:
+                label = "HUMAN"
+            else:
+                label = "MACHINE"
+                scores = 1 - scores
+
+        return f"""
+        <tr>
+            <td>{input_sentences}</td>
+            <td>{source_sentences}</td>
+            <td>{label}<br>({scores*100:.2f}%)</td>
+            <td>{source_text_urls}</td>
+        </tr>
+        """
+
+    def format_image_governor_row(self, max_length=30):
+        if self.image_referent_url is not None or self.image_referent_url != "":
+            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
+            short_url = self.shorten_url(self.image_referent_url, max_length)
+            source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
+        else:
+            source_image = "Image not found"
+            source_image_url = ""
+
+        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
+
+
     def shorten_url(self, url, max_length=30):
         if url is None:
             return ""
@@ -374,23 +556,19 @@ class NewsVerification():
             short_url = url
         return short_url
 
+
     def color_text(self, text, colored_idx, highlighted_idx):
         paragraph = ""
         words = text.split()
 
         starts, ends = self.extract_starts_ends(colored_idx)
         starts, ends = self.filter_indices(starts, ends, highlighted_idx)
-
-        print(f"starts_2: {starts}")
-        print(f"ends_2: {ends}")
+
         previous_end = 0
         for start, end in zip(starts, ends):
            paragraph += " ".join(words[previous_end:start])
 
            equal_words = " ".join(words[start:end])
-           print(f"starts_2: {start}")
-           print(f"ends_2: {end}")
-           print(f"equal_words: {words[start:end]}")
            paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
 
            previous_end = end
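The table builders above collapse per-sentence predictions into one verdict: HUMAN similarity scores count as-is, MACHINE scores are flipped so that higher always means "more human", the mean decides the final label, and a MACHINE verdict reports the flipped mean. A condensed sketch of that aggregation from format_text_ordinary_user_row / format_text_governor_row (the input pairs are made-up examples, not values from the app):

# Condensed restatement of the verdict logic in format_text_ordinary_user_row
# and format_text_governor_row.
def aggregate(sentences: list[tuple[str, float]]) -> tuple[str, float]:
    scores = 0.0
    count = 0
    for label, similarity in sentences:
        if label == "HUMAN":
            scores += similarity        # confidence the sentence is human-written
        elif label == "MACHINE":
            scores += 1 - similarity    # flip so higher always means "human"
        count += 1
    if count == 0 or scores == 0:
        return "UNKNOWN", 0.0
    scores /= count
    if scores > 0.5:
        return "HUMAN", scores
    return "MACHINE", 1 - scores        # report confidence in the MACHINE verdict

print(aggregate([("HUMAN", 0.9), ("MACHINE", 0.8)]))  # ('HUMAN', ~0.55)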
src/application/text/entity.py CHANGED

@@ -15,27 +15,28 @@ AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
 AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
 
 client = openai.AzureOpenAI(
-    api_version = AZURE_OPENAI_API_VERSION,
+    api_version = "2024-05-01-preview", # AZURE_OPENAI_API_VERSION,
     api_key = AZURE_OPENAI_API_KEY,
     azure_endpoint = AZURE_OPENAI_ENDPOINT,
 )
 
 
-def extract_entities_gpt(original_text, compared_text, text_generation_model="gpt-4o-mini"):
-    # "o1-mini"
+def extract_entities_gpt(original_text, compared_text, text_generation_model="o1-mini"):
+    # "gpt-4o-mini" or "o1-mini"
     # Generate text using the selected models
     prompt = f"""
 Compare the ORIGINAL TEXT and the COMPARED TEXT.
 Identify and extract pairs of corresponding entities where the paraphrasing has resulted in a *significant* change in meaning.
 Focus *only* on entities where the paraphrasing has resulted in a *significant* change in meaning. This includes, but is not limited to:
 * **Numerical changes:** e.g., "five" changed to "ten," "10%" changed to "50%"
+* **Time changes:** e.g., "Monday" changed to "Sunday," "10th" changed to "21st"
 * **Name changes:** e.g., "Tokyo" changed to "New York," "Japan" changed to "Japanese"
 * **Opposite meanings:** e.g., "increase" changed to "decrease," "good" changed to "bad"
 * **Semantically different words:** e.g., "car" changed to "truck," "walk" changed to "run"
 
 Exclude entities where the meaning remains essentially the same, even if the wording is different (e.g., "big" changed to "large," "house" changed to "residence"). Also exclude purely stylistic changes that don't affect the core meaning.
 
-Output the extracted entity pairs, one pair per line, in the following JSON-like list format:
+Output the extracted entity pairs, one pair per line, in the following JSON-like list format without wrapping characters:
 [
     ["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
     ["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
@@ -55,7 +56,7 @@ If there are no entities that satisfy above condition, output empty list "[]".
     try:
         response = client.chat.completions.create(
             model=text_generation_model,
-            messages = [{"role": "
+            messages = [{"role": "user", "content": prompt}],
         )
 
         res = response.choices[0].message.content
@@ -69,7 +70,13 @@ If there are no entities that satisfy above condition, output empty list "[]".
 def read_json(json_string) -> list[list[str]]:
     try:
         entities = json.loads(json_string)
-        return entities
+        # Remove duplicate pairs of entities
+        unique_entities = []
+        for inner_list in entities:
+            if inner_list not in unique_entities:
+                unique_entities.append(inner_list)
+
+        return unique_entities
 
     except json.JSONDecodeError as e:
         print(f"Error decoding JSON: {e}")
@@ -132,26 +139,22 @@ def assign_colors_to_entities(entities):
 def highlight_entities(text1, text2):
     if text1 == "" or text2 == "":
         return []
-
-    print(f"text1: {text1}")
-    print(f"text2: {text2}")
+
     entities_text = extract_entities_gpt(text1, text2)
     print(f"entities_text: {entities_text}")
 
+    # Clean up entities: remove wrapping characters
+    entities_text = entities_text.replace("```json", "").replace("```", "")
+
     entities = read_json(entities_text)
-
+
     # Assign colors to entities
     entities_with_colors = assign_colors_to_entities(entities)
-
-
-    # Apply highlighting to entities
-    # highlighted_text_1 = apply_highlight(text1, entities_with_colors, "input")
-    # highlighted_text_2 = apply_highlight(text2, entities_with_colors, "source")
-
+
     return entities_with_colors
 
 
-def apply_highlight(text, entities_with_colors, key="input"):
+def apply_highlight(text, entities_with_colors, key="input", count = 0):
     if entities_with_colors == []:
         return text, []
 
@@ -182,7 +185,7 @@ def apply_highlight(text, entities_with_colors, key="input"):
     # Style the index as a label
     index_label = (f'<span_style="background-color:{label_color};color:white;'
                    f'padding:1px_4px;border-radius:4px;font-size:12px;'
-                   f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1}</span>')
+                   f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>')
 
     # Append highlighted text with index label
     highlighted_text += (f'\n<span_style="background-color:{entity_color};color:black;'
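Two details of this entity.py change are easy to miss. The default model moves to o1-mini and the prompt is sent as a single user-role message; the o1-series models did not accept a system role at the time, so a plain user message is presumably the compatible choice here. Separately, the new "without wrapping characters" instruction in the prompt and the ```json fence stripping in highlight_entities work as a pair: models often wrap JSON output in code fences regardless of instructions, and read_json would otherwise fail to parse the result.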
src/application/text/helper.py CHANGED

@@ -1,4 +1,5 @@
 from collections import Counter
+from difflib import SequenceMatcher
 import re
 import string
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -144,6 +145,30 @@ def extract_important_phrases(paragraph: str, keywords: list[str], phrase_length
 
     return important_phrases
 
+def extract_equal_text(text1, text2):
+    def cleanup(text):
+        text = text.lower()
+        text = text.translate(str.maketrans('', '', string.punctuation))
+        return text
+
+    splited_text1 = cleanup(text1).split()
+    splited_text2 = cleanup(text2).split()
+
+    s = SequenceMatcher(None, splited_text1, splited_text2)
+
+    equal_idx_1 = []
+    equal_idx_2 = []
+    text1 = text1.split()
+    text2 = text2.split()
+    for tag, i1, i2, j1, j2 in s.get_opcodes():
+        if tag == 'equal':
+            equal_idx_1.append({"start": i1, "end": i2})
+            equal_idx_2.append({"start": j1, "end": j2})
+            # subtext_1 = " ".join(text1[i1:i2])
+            # subtext_2 = " ".join(text2[j1:j2])
+            # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] {subtext_1!r:>55} --> {subtext_2!r}')
+    return equal_idx_1, equal_idx_2
+
 def connect_consecutive_indexes(nums):
     """
     Connects consecutive integers in a list.
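extract_equal_text runs SequenceMatcher over the lowercased, punctuation-stripped token lists and returns the 'equal' opcode spans as word-index ranges, which color_text later maps back onto the raw words. A quick usage sketch (module path as in this repo):

from src.application.text.helper import extract_equal_text

idx1, idx2 = extract_equal_text(
    "Almiron made 223 appearances for Newcastle.",
    "Almiron made 200 appearances for Newcastle United.",
)
# Each dict is a {"start", "end"} word-index range into the respective sentence:
print(idx1)  # [{'start': 0, 'end': 2}, {'start': 3, 'end': 6}]
print(idx2)  # [{'start': 0, 'end': 2}, {'start': 3, 'end': 6}]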
src/application/text/preprocessing.py CHANGED

@@ -1,6 +1,6 @@
 from nltk.tokenize import sent_tokenize
 
-def
+def split_into_paragraphs(input_text):
     """
     Splits input text into sentences by newlines.
 
src/application/text/search_detection.py CHANGED

@@ -1,9 +1,11 @@
+import string
 import warnings
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
-from src.application.text.preprocessing import
+from src.application.text.preprocessing import split_into_paragraphs
 from src.application.text.search import generate_search_phrases, search_by_google
 from src.application.url_reader import URLReader
+from src.application.text.helper import extract_equal_text
 import numpy as np
 import nltk
 import torch
@@ -41,7 +43,7 @@ def detect_text_by_relative_search(input_text, index, is_support_opposite = False):
     search_results = search_by_google(candidate)
     urls = [item['link'] for item in search_results.get("items", [])]
 
-    for url in urls[:
+    for url in urls[:3]:
         if url in checked_urls: # visited url
             continue
         if "bbc.com" not in url:
@@ -196,14 +198,13 @@ def check_paraphrase(input_text, page_text, url):
         return False, []
 
     # Extract sentences from input text and web page
-    #
-    input_sentences =
+    # input_sentences = split_into_paragraphs(input_text)
+    input_sentences = [input_text]
 
-
     if not page_text:
         return is_paraphrase_text, []
-
-    page_sentences =
+
+    page_sentences = split_into_paragraphs(page_text)
     if not input_sentences or not page_sentences:
         return is_paraphrase_text, []
 
@@ -213,8 +214,6 @@ def check_paraphrase(input_text, page_text, url):
         additional_sentences.append(sentence.replace(", external", ""))
     page_sentences.extend(additional_sentences)
 
-    # min_matching_sentences = math.ceil(len(input_sentences) * MIN_RATIO_PARAPHRASE_NUM)
-
     # Encode sentences into embeddings
     embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
     embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)
@@ -225,10 +224,12 @@ def check_paraphrase(input_text, page_text, url):
     # Find sentence alignments
     alignment = {}
     paraphrased_sentence_count = 0
+    best_matched_sentence = ""
     for i, sentence1 in enumerate(input_sentences):
         max_sim_index = np.argmax(similarity_matrix[i])
         max_similarity = similarity_matrix[i][max_sim_index]
-
+
+        best_matched_sentence = page_sentences[max_sim_index]
         is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
 
         if is_paraphrase_sentence is False:
@@ -262,9 +263,20 @@ def check_paraphrase(input_text, page_text, url):
     # Check if enough sentences are paraphrases
 
     is_paraphrase_text = paraphrased_sentence_count > 0 #min_matching_sentences
-
+
+    # Method 2: Check if overlapped words between sentences are more than 50%
+    equal_idx_1, _ = extract_equal_text(input_sentences[0], best_matched_sentence)
+    matched_count = 0
+    for index in equal_idx_1:
+        matched_count += index["end"] - index["start"]
+    sent = input_sentences[0].translate(str.maketrans('', '', string.punctuation))
+    num_words = len(sent.split())
+    if matched_count > num_words / 2:
+        is_paraphrase_text = True
+
     return is_paraphrase_text, alignment
 
+
 def similarity_ratio(a, b):
     """
     Calculates the similarity ratio between two strings using SequenceMatcher.
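The new "Method 2" fallback in check_paraphrase declares a paraphrase when more than half of the input sentence's words (punctuation removed) fall inside 'equal' blocks against the best-matching page sentence, even if the embedding similarity stayed below the threshold. The same check as a stand-alone helper:

import string

from src.application.text.helper import extract_equal_text

def overlap_says_paraphrase(input_sentence: str, best_match: str) -> bool:
    # Count words covered by SequenceMatcher 'equal' blocks.
    equal_idx_1, _ = extract_equal_text(input_sentence, best_match)
    matched_count = sum(idx["end"] - idx["start"] for idx in equal_idx_1)
    # Compare against the word count of the punctuation-stripped input.
    stripped = input_sentence.translate(str.maketrans('', '', string.punctuation))
    return matched_count > len(stripped.split()) / 2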
test.py CHANGED

@@ -1,46 +1,27 @@
-from difflib import SequenceMatcher
-import string
+import json
 
-def extract_equal_text(text1, text2):
-    def cleanup(text):
-        text = text.lower()
-        text = text.translate(str.maketrans('', '', string.punctuation))
-        return text
-
-    splited_text1 = cleanup(text1).split()
-    splited_text2 = cleanup(text2).split()
-
-    s = SequenceMatcher(None, splited_text1, splited_text2)
-
-    equal_idx_1 = []
-    equal_idx_2 = []
-    text1 = text1.split()
-    text2 = text2.split()
-    for tag, i1, i2, j1, j2 in s.get_opcodes():
-        if tag == 'equal':
-            equal_idx_1.append({"start": i1, "end": i2})
-            equal_idx_2.append({"start": j1, "end": j2})
-            subtext_1 = " ".join(text1[i1:i2])
-            subtext_2 = " ".join(text2[j1:j2])
-            print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] {subtext_1!r:>55} --> {subtext_2!r}')
-
-    return equal_idx_1, equal_idx_2
-
-text1 = """
-Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m.
-Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
-He made a substitute appearance and waved farewell to fans in Newcastle's recent win against Southampton.
-Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2022-23, and scored against Paris St-Germain in the Champions League.
+text = """```json
+[
+    ["Sunday", "Thursday"],
+    ["two millions", "one million"],
+    ["north", "east"],
+    ["Japan", "UK"],
+    ["Sunday", "Thursday"]
+]
+```
 """
-
-
-
-
-
-
-
-
+def read_json(json_string) -> list[list[str]]:
+    try:
+        entities = json.loads(json_string)
+        # Remove duplicate pairs of entities
+        unique_data = []
+        for inner_list in entities:
+            if inner_list not in unique_data:
+                unique_data.append(inner_list)
+
+        return unique_data
 
-
-
-
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON: {e}")
+        return []
+print(read_json(text.replace("```json", "").replace("```", "")))
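As a sanity check, running this script prints [['Sunday', 'Thursday'], ['two millions', 'one million'], ['north', 'east'], ['Japan', 'UK']]: the ```json fence markers are stripped before json.loads, and read_json drops the duplicate ["Sunday", "Thursday"] pair while preserving order.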