copyright_checker / analysis.py
minko186's picture
refactoring
45d10c4
raw
history blame
3.03 kB
import requests
import httpx
import torch
import re
from bs4 import BeautifulSoup
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import asyncio
from scipy.special import softmax
from evaluate import load
from datetime import date
import nltk
import fitz
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import nltk, spacy, subprocess, torch
import plotly.graph_objects as go
import torch.nn.functional as F
import nltk
from unidecode import unidecode
import time
import yaml
import nltk
import os
from explainability import *
from dotenv import load_dotenv
import subprocess
# Imported here because it is only needed to locate the running interpreter
# for the spaCy model download below.
import sys

# Download the NLTK "punkt" and "stopwords" data packages.
nltk.download("punkt")
nltk.download("stopwords")

# Load variables from a .env file into the process environment.
load_dotenv()

# Read the project configuration; the encoding is explicit so the result does
# not depend on the platform's default locale.
with open("config.yaml", "r", encoding="utf-8") as file:
    params = yaml.safe_load(file)

# Prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# GPT-2 model and tokenizer, both loaded from the id given in the config.
readability_model_id = params["READABILITY_MODEL_ID"]
gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)

# Use the interpreter that is running this module, not whatever "python"
# happens to be first on PATH, so the model lands in the right environment.
command = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
try:
    # Fast path: the model is already installed, so no download is needed.
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy raises OSError when the model package cannot be found; download
    # it once and retry. A failed download surfaces through the second load.
    subprocess.run(command)
    nlp = spacy.load("en_core_web_sm")
def depth_analysis(input_text):
    """Build a radar chart of text-complexity metrics for ``input_text``.

    Six metrics are computed with the helper functions available in this
    module's namespace and plotted on a polar chart: readability (Gunning
    fog), syntactic tree depth, vocabulary richness (TTR), perplexity,
    average sentence length and average word length.

    Every metric except vocabulary richness is passed through ``normalize``
    with a fixed ``min_value``/``max_value`` pair; the TTR value is plotted
    exactly as ``vocabulary_richness_ttr`` returns it.

    Args:
        input_text: The text to analyse.

    Returns:
        A ``plotly.graph_objects.Figure`` holding a single filled
        ``Scatterpolar`` trace whose radial axis is fixed to ``[0, 100]``.
    """
    # NOTE(review): the radial axis is 0-100, so ``normalize`` and the TTR
    # helper presumably return values on that scale -- confirm.

    # Vocabulary richness from the first preprocessing variant.
    tokens = preprocess_text1(input_text)
    richness = vocabulary_richness_ttr(tokens)

    # Readability score, scaled against a 0-20 range.
    fog_scaled = normalize(
        calculate_gunning_fog(input_text), min_value=0, max_value=20
    )

    # Sentence and word length averages from the second preprocessing variant.
    words, sentences = preprocess_text2(input_text)
    mean_sentence_len = calculate_average_sentence_length(sentences)
    mean_word_len = calculate_average_word_length(words)
    sentence_len_scaled = normalize(mean_sentence_len, min_value=0, max_value=40)
    word_len_scaled = normalize(mean_word_len, min_value=0, max_value=8)

    # Syntactic tree depth from the module-level spaCy pipeline.
    tree_depth_scaled = normalize(
        calculate_syntactic_tree_depth(nlp, input_text),
        min_value=0,
        max_value=10,
    )

    # Perplexity under the module-level GPT-2 model.
    perplexity_scaled = normalize(
        calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device),
        min_value=0,
        max_value=30,
    )

    # Insertion order fixes the order of the axes around the chart.
    metrics = {
        "readability": fog_scaled,
        "syntactic tree depth": tree_depth_scaled,
        "vocabulary richness": richness,
        "perplexity": perplexity_scaled,
        "average sentence length": sentence_len_scaled,
        "average word length": word_len_scaled,
    }

    radar_trace = go.Scatterpolar(
        r=list(metrics.values()),
        theta=list(metrics.keys()),
        fill="toself",
        name="Radar Plot",
    )

    chart = go.Figure()
    chart.add_trace(radar_trace)
    chart.update_layout(
        polar={"radialaxis": {"visible": True, "range": [0, 100]}},
        showlegend=False,
        margin={"l": 10, "r": 20, "b": 10, "t": 10},
    )
    return chart