Commit dd9b08a by aliasgerovs
Parent: ca39c04

Added latest updates related to highlighter fix

Files changed (3):
  1. app.py +3 -3
  2. highlighter.py +19 -11
  3. predictors.py +47 -116
app.py CHANGED
@@ -6,7 +6,7 @@ from predictors import update,update_main, correct_text, split_text
 from analysis import depth_analysis
 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date, html_highlight
-from highlighter import analyze_and_highlight
+from highlighter import segmented_higlighter
 from utils import extract_text_from_pdf, len_validator
 import yaml
 from functools import partial
@@ -20,9 +20,9 @@ with open("config.yaml", "r") as file:
 model_list = params["MC_OUTPUT_LABELS"]
 
 
-analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
+analyze_and_highlight_bc = partial(segmented_higlighter, model_type="bc")
 analyze_and_highlight_quillbot = partial(
-    analyze_and_highlight, model_type="quillbot"
+    segmented_higlighter, model_type="quillbot"
 )
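
A quick sanity check on the rewiring above: the public wrapper names stay the same, so the rest of app.py is untouched; only the underlying function changes. A minimal sketch of how the keyword-bound partials resolve their arguments, using a hypothetical stub in place of the real highlighter function:

from functools import partial

# Hypothetical stub standing in for highlighter.segmented_higlighter; it only
# echoes its arguments so the partial binding is visible.
def segmented_higlighter(text, bias_buster_selected, model_type):
    return f"model_type={model_type}, bias_buster={bias_buster_selected}, text={text!r}"

analyze_and_highlight_bc = partial(segmented_higlighter, model_type="bc")

# Positional arguments fill the remaining parameters in declaration order:
print(analyze_and_highlight_bc("Some input text.", False))
# model_type=bc, bias_buster=False, text='Some input text.'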
highlighter.py CHANGED
@@ -2,7 +2,7 @@ from lime.lime_text import LimeTextExplainer
 from nltk.tokenize import sent_tokenize
 from predictors import predict_for_explainanility
 from predictors import update, correct_text, split_text
-
+from predictors import split_text_allow_complete_sentences_nltk, get_token_length
 
 def explainer(text, model_type):
     def predictor_wrapper(text):
@@ -15,7 +15,7 @@ def explainer(text, model_type):
     sentences = [sent for sent in sent_tokenize(text)]
     num_sentences = len(sentences)
     exp = explainer_.explain_instance(
-        text, predictor_wrapper, num_features=num_sentences, num_samples=2000
+        text, predictor_wrapper, num_features=num_sentences, num_samples=100
     )
     weights_mapping = exp.as_map()[1]
     sentences_weights = {sentence: 0 for sentence in sentences}
@@ -23,15 +23,12 @@ def explainer(text, model_type):
         if 0 <= idx < len(sentences):
             sentences_weights[sentences[idx]] = weight
     print(sentences_weights, model_type)
-    return sentences_weights, exp
+    return sentences_weights, sentences, exp
 
 
 def analyze_and_highlight(text, bias_buster_selected, model_type):
-    if bias_buster_selected:
-        text = update(text)
-
     highlighted_text = ""
-    sentences_weights, _ = explainer(text, model_type)
+    sentences_weights, sentences, _ = explainer(text, model_type)
     positive_weights = [weight for weight in sentences_weights.values() if weight >= 0]
     negative_weights = [weight for weight in sentences_weights.values() if weight < 0]
 
@@ -44,7 +41,8 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
     max_positive_weight += smoothing_factor
     min_negative_weight -= smoothing_factor
 
-    for sentence, weight in sentences_weights.items():
+    for sentence in sentences:
+        weight = sentences_weights[sentence]
         sentence = sentence.strip()
         if not sentence:
             continue
@@ -67,6 +65,17 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
         )
         highlighted_text += highlighted_sentence
 
+    return highlighted_text
+
+def segmented_higlighter(text, bias_buster_selected, model_type):
+    if bias_buster_selected:
+        text = update(text)
+    result = ""
+    segmented_results = split_text_allow_complete_sentences_nltk(text)
+    for segment in segmented_results:
+        chunk = analyze_and_highlight(segment, False, model_type)
+        result = result + " " + chunk
+    print(result)
     if model_type == "bc":
         gradient_labels = ["HUMAN", "AI"]
     elif model_type == "quillbot":
@@ -76,7 +85,7 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
 
     highlighted_text = (
         "<div>"
-        + highlighted_text
+        + result
         + "<div style='margin-top: 20px; text-align: center;'>"
         + "<div style='position: relative; display: inline-block; width: 60%; height: 20px; background: linear-gradient(to right, #00FF00, #FFFFFF, #FF0000); font-family: \"Segoe UI\", Tahoma, Geneva, Verdana, sans-serif; font-size: 10px; font-weight: 600; color: #222; border-radius: 10px; box-shadow: 0px 2px 5px rgba(0, 0, 0, 0.1);'>"
         + f"<span style='position: absolute; left: 5px; top: 50%; transform: translateY(-50%); color: #000; font-weight: 600;'>{gradient_labels[0]}</span>"
@@ -85,5 +94,4 @@ def analyze_and_highlight(text, bias_buster_selected, model_type):
         + "</div>"
         + "</div>"
     )
-
-    return highlighted_text
+    return highlighted_text
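
Two behavioural notes on this file: explainer now returns the ordered sentences list alongside the weights, so the highlighting loop iterates in document order, and num_samples drops from 2000 to 100, which makes LIME much faster but its per-sentence weights noticeably noisier between runs. Below is a minimal, self-contained sketch of the LIME flow this module relies on; the toy_predictor, the sample text, and the sentence-level split_expression are illustrative assumptions, not the repo's actual models or configuration:

import nltk
import numpy as np
from lime.lime_text import LimeTextExplainer
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)

# Toy classifier_fn standing in for predict_for_explainanility: LIME requires
# a function mapping a list of perturbed texts to an (n, n_classes) array.
def toy_predictor(texts):
    probs = []
    for t in texts:
        p = min(t.count("!") / 3.0, 1.0)  # crude "AI-ness" proxy
        probs.append([1.0 - p, p])
    return np.array(probs)

# bow=False keeps positions distinct; split_expression may be a callable, so
# whole sentences (not words) become the interpretable features.
explainer_ = LimeTextExplainer(split_expression=sent_tokenize, bow=False)
text = "This sentence is calm. This one is loud! This one is very loud!!"
exp = explainer_.explain_instance(
    text, toy_predictor, num_features=3, num_samples=100
)
print(exp.as_map()[1])  # [(sentence_index, weight), ...], as consumed above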
 
predictors.py CHANGED
@@ -24,7 +24,6 @@ with open("config.yaml", "r") as file:
 nltk.download("punkt")
 nltk.download("stopwords")
 device_needed = "cuda" if torch.cuda.is_available() else "cpu"
-device = 'cpu'
 text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
 text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
 text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
@@ -50,12 +49,12 @@ quillbot_model = AutoModelForSequenceClassification.from_pretrained(
 
 
 # proxy models for explainability
-mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
+mini_bc_model_name = "polygraf-ai/bc-model"
 bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
 bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
     mini_bc_model_name
 ).to(device_needed)
-mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
+mini_humanizer_model_name = "polygraf-ai/humanizer-model"
 humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
     mini_humanizer_model_name
 )
@@ -119,83 +118,58 @@ def update_main(text: str):
     corrections_display = "\n\n".join([f"Original: {orig}\nCorrected: {corr}" for orig, corr in corrections])
     return corrected_text, corrections_display
 
-def split_text_allow_complete_sentences_nltk(
-    text,
-    max_length=256,
-    tolerance=30,
-    min_last_segment_length=100,
-    type_det="bc",
-):
-    sentences = nltk.sent_tokenize(text)
-    segments = []
-    current_segment = []
+def split_text(text: str) -> list:
+    sentences = sent_tokenize(text)
+    return [[sentence] for sentence in sentences]
+
+def get_token_length(tokenizer, sentence):
+    return len(tokenizer.tokenize(sentence))
+
+def split_text_allow_complete_sentences_nltk(text, type_det="bc"):
+    sentences = sent_tokenize(text)
+    chunks = []
+    current_chunk = []
     current_length = 0
     if type_det == "bc":
         tokenizer = text_bc_tokenizer
-        max_length = bc_token_size
+        max_tokens = bc_token_size
     elif type_det == "mc":
         tokenizer = text_mc_tokenizer
-        max_length = mc_token_size
-    for sentence in sentences:
-        tokens = tokenizer.tokenize(sentence)
-        sentence_length = len(tokens)
+        max_tokens = mc_token_size
+
+    elif type_det == "quillbot":
+        tokenizer = quillbot_tokenizer
+        max_tokens = 256
+
+    def add_sentence_to_chunk(sentence):
+        nonlocal current_chunk, current_length
+        sentence_length = get_token_length(tokenizer, sentence)
+        if current_length + sentence_length > max_tokens:
+            chunks.append((current_chunk, current_length))
+            current_chunk = []
+            current_length = 0
+        current_chunk.append(sentence)
+        current_length += sentence_length
 
-        if current_length + sentence_length <= max_length + tolerance - 2:
-            current_segment.append(sentence)
-            current_length += sentence_length
-        else:
-            if current_segment:
-                encoded_segment = tokenizer.encode(
-                    " ".join(current_segment),
-                    add_special_tokens=True,
-                    max_length=max_length + tolerance,
-                    truncation=True,
-                )
-                segments.append((current_segment, len(encoded_segment)))
-            current_segment = [sentence]
-            current_length = sentence_length
-
-    if current_segment:
-        encoded_segment = tokenizer.encode(
-            " ".join(current_segment),
-            add_special_tokens=True,
-            max_length=max_length + tolerance,
-            truncation=True,
-        )
-        segments.append((current_segment, len(encoded_segment)))
-
-    final_segments = []
-    for i, (seg, length) in enumerate(segments):
-        if i == len(segments) - 1:
-            if length < min_last_segment_length and len(final_segments) > 0:
-                prev_seg, prev_length = final_segments[-1]
-                combined_encoded = tokenizer.encode(
-                    " ".join(prev_seg + seg),
-                    add_special_tokens=True,
-                    max_length=max_length + tolerance,
-                    truncation=True,
-                )
-                if len(combined_encoded) <= max_length + tolerance:
-                    final_segments[-1] = (prev_seg + seg, len(combined_encoded))
-                else:
-                    final_segments.append((seg, length))
+    for sentence in sentences:
+        add_sentence_to_chunk(sentence)
+    if current_chunk:
+        chunks.append((current_chunk, current_length))
+    adjusted_chunks = []
+    while chunks:
+        chunk = chunks.pop(0)
+        if len(chunks) > 0 and chunk[1] < max_tokens / 2:
+            next_chunk = chunks.pop(0)
+            combined_length = chunk[1] + next_chunk[1]
+            if combined_length <= max_tokens:
+                adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
             else:
-                final_segments.append((seg, length))
+                adjusted_chunks.append(chunk)
+                chunks.insert(0, next_chunk)
         else:
-            final_segments.append((seg, length))
-
-    decoded_segments = []
-    encoded_segments = []
-    for seg, _ in final_segments:
-        encoded_segment = tokenizer.encode(
-            " ".join(seg),
-            add_special_tokens=True,
-            max_length=max_length + tolerance,
-            truncation=True,
-        )
-        decoded_segment = tokenizer.decode(encoded_segment)
-        decoded_segments.append(decoded_segment)
-    return decoded_segments
+            adjusted_chunks.append(chunk)
+    result_chunks = [" ".join(chunk[0]) for chunk in adjusted_chunks]
+    return result_chunks
 
 
 def predict_quillbot(text, bias_buster_selected):
@@ -227,7 +201,7 @@ def predict_for_explainanility(text, model_type=None):
         tokenizer = humanizer_tokenizer_mini
     elif model_type == "bc":
        cleaning = True
-        max_length = 512
+        max_length = bc_token_size
        model = bc_model_mini
        tokenizer = bc_tokenizer_mini
     else:
@@ -278,46 +252,6 @@ def predict_mc(model, tokenizer, text):
     return output_norm
 
 
-def predict_mc_scores(input):
-    bc_scores = []
-    mc_scores = []
-
-    samples_len_bc = len(
-        split_text_allow_complete_sentences_nltk(input, type_det="bc")
-    )
-    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
-    for i in range(samples_len_bc):
-        cleaned_text_bc = remove_special_characters(segments_bc[i])
-        bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
-        bc_scores.append(bc_score)
-    bc_scores_array = np.array(bc_scores)
-    average_bc_scores = np.mean(bc_scores_array, axis=0)
-    bc_score_list = average_bc_scores.tolist()
-    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
-    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
-    samples_len_mc = len(
-        split_text_allow_complete_sentences_nltk(input, type_det="mc")
-    )
-    for i in range(samples_len_mc):
-        cleaned_text_mc = remove_special_characters(segments_mc[i])
-        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
-        mc_scores.append(mc_score)
-    mc_scores_array = np.array(mc_scores)
-    average_mc_scores = np.mean(mc_scores_array, axis=0)
-    mc_score_list = average_mc_scores.tolist()
-    mc_score = {}
-    for score, label in zip(mc_score_list, mc_label_map):
-        mc_score[label.upper()] = score
-
-    sum_prob = 1 - bc_score["HUMAN"]
-    for key, value in mc_score.items():
-        mc_score[key] = value * sum_prob
-    if sum_prob < 0.01:
-        mc_score = {}
-
-    return mc_score
-
-
 def predict_bc_scores(input):
     bc_scores = []
     samples_len_bc = len(
@@ -385,9 +319,6 @@ def predict_mc_scores(input):
     for score, label in zip(mc_score_list, mc_label_map):
         mc_score[label.upper()] = score
 
-    total = sum(mc_score.values())
-    # Normalize each value by dividing it by the total
-    mc_score = {key: value / total for key, value in mc_score.items()}
     sum_prob = 1 - bc_score["HUMAN"]
     for key, value in mc_score.items():
         mc_score[key] = value * sum_prob
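
The rewritten split_text_allow_complete_sentences_nltk drops the encode/decode round-trips in favour of a single greedy pass over token counts, followed by a second pass that folds any chunk under half capacity into its neighbour when the pair still fits. A runnable sketch of that pack-and-merge strategy with a toy whitespace tokenizer; pack_sentences is a hypothetical name, and the non-empty guard on the flush is an addition here (the committed version appends an empty chunk when the very first sentence alone exceeds max_tokens):

from typing import List

# Toy whitespace "tokenizer" standing in for the HF tokenizers.
class WhitespaceTokenizer:
    def tokenize(self, text: str) -> List[str]:
        return text.split()

def get_token_length(tokenizer, sentence):
    return len(tokenizer.tokenize(sentence))

def pack_sentences(sentences, tokenizer, max_tokens):
    chunks, current, length = [], [], 0
    for sentence in sentences:
        n = get_token_length(tokenizer, sentence)
        if length + n > max_tokens and current:  # flush only non-empty chunks
            chunks.append((current, length))
            current, length = [], 0
        current.append(sentence)
        length += n
    if current:
        chunks.append((current, length))
    # Second pass: merge a chunk under half capacity into the next one
    # whenever their combined token count still fits within max_tokens.
    adjusted = []
    while chunks:
        head = chunks.pop(0)
        if chunks and head[1] < max_tokens / 2:
            nxt = chunks.pop(0)
            if head[1] + nxt[1] <= max_tokens:
                adjusted.append((head[0] + nxt[0], head[1] + nxt[1]))
                continue
            chunks.insert(0, nxt)  # does not fit: put it back
        adjusted.append(head)
    return [" ".join(chunk) for chunk, _ in adjusted]

sentences = ["one two three", "four five", "six seven eight nine"]
print(pack_sentences(sentences, WhitespaceTokenizer(), max_tokens=6))
# ['one two three four five', 'six seven eight nine']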