Spaces:

NextGenLabs
/

ai_agents

Sleeping

File size: 6,948 Bytes

56a3465


import pymupdf
import tiktoken
import textstat
from docx import Document
import io
# from rake_nltk import Rake
# import nltk
# from nltk.corpus import stopwords
from openai import OpenAI

# Download NLTK stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

# Thin wrapper around the OpenAI chat completions API (gpt-4o-mini).
def extract_relevant_keywords(prompt: str) -> str:
    """Send ``prompt`` to gpt-4o-mini as one user message and return the reply.

    The function itself does no keyword-specific processing: the model simply
    answers whatever ``prompt`` asks for. A new ``OpenAI`` client is created
    with default arguments on every call.

    Args:
        prompt: Full text of the user message sent to the model.

    Returns:
        The message content of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = OpenAI().chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def evaluate_text_quality(text: str) -> dict:
    """Compute readability metrics for ``text`` and a weighted global score.

    Five ``textstat`` metrics are computed, each one is mapped onto a 0-1
    scale where 1 means "easiest to read", and the weighted average of those
    normalized values is reported both on a 0-1 and on a 0-5 scale.

    Args:
        text: The text to evaluate.

    Returns:
        A dict with the raw value of every metric (``flesch_reading_ease``,
        ``flesch_kincaid_grade``, ``gunning_fog``, ``smog_index``,
        ``automated_readability_index``) plus ``global_score`` (0-1) and
        ``global_score_0_5`` (0-5).
    """

    def normalize_score(score, min_score, max_score):
        # Map ``score`` linearly onto [0, 1]. The raw metrics are not bounded
        # by the assumed ranges, so the result is clamped to keep the global
        # score inside its advertised 0-5 scale.
        ratio = (score - min_score) / (max_score - min_score)
        return min(1.0, max(0.0, ratio))

    # Raw readability metrics.
    raw_scores = {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "gunning_fog": textstat.gunning_fog(text),
        "smog_index": textstat.smog_index(text),
        "automated_readability_index": textstat.automated_readability_index(text),
    }

    # Reading ease is used as-is on an assumed 0-100 range (higher is better).
    normalized = {
        "flesch_reading_ease": normalize_score(
            raw_scores["flesch_reading_ease"], 0, 100
        ),
    }
    # The other metrics are grade levels on an assumed 0-18 range: a higher
    # value means a more difficult text, so the normalized value is inverted.
    for metric in (
        "flesch_kincaid_grade",
        "gunning_fog",
        "smog_index",
        "automated_readability_index",
    ):
        normalized[metric] = 1 - normalize_score(raw_scores[metric], 0, 18)

    # Weights for each metric (adjust these as needed); they sum to 1.
    weights = {
        "flesch_reading_ease": 0.25,
        "flesch_kincaid_grade": 0.25,
        "gunning_fog": 0.2,
        "smog_index": 0.15,
        "automated_readability_index": 0.15,
    }

    # Weighted average of the normalized metrics, then scaled to 0-5.
    global_score = sum(
        normalized[metric] * weight for metric, weight in weights.items()
    )
    global_score_0_5 = global_score * 5

    return {
        **raw_scores,
        "global_score": global_score,
        "global_score_0_5": global_score_0_5,
    }

# def extract_keywords(text):
#     rake = Rake(stopwords.words('french'))
#     rake.extract_keywords_from_text(text)
#     return rake.get_ranked_phrases()



def count_tokens(input_string: str) -> int:
    """Return the number of ``cl100k_base`` tokens in ``input_string``.

    Args:
        input_string: Text to tokenize with tiktoken's ``cl100k_base`` encoding.

    Returns:
        The length of the encoded token sequence.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(input_string))

def audit_descriptif_pdf(file, max_img_width) -> dict:
    """Audit a PDF: per-page and document-wide counts plus extracted content.

    For every page the function collects the embedded images (with display
    dimensions capped at ``max_img_width``), the URI links, the tables (as
    pandas DataFrames) and the plain text, and counts images, links, tables,
    tokens and words. The text of all pages is then sent to the language
    model to obtain key words.

    Args:
        file: Object whose ``read()`` returns the PDF content passed to
            ``pymupdf.open(stream=...)``.
        max_img_width: Maximum image width. For wider images the reported
            width is set to this value and the reported height is scaled by
            the same ratio; the image bytes themselves are not modified.

    Returns:
        A dict with two entries:
        ``"audit"`` holds the document totals, the ``key_words`` string and
        one ``page_<page.number>`` dict of per-page counts;
        ``"content"`` maps ``page_<page.number>`` to a dict with the keys
        ``images`` (tuples of bytes, width, height), ``texte``, ``liens``
        and ``tableaux``.
    """
    doc_content = dict()

    # The context manager guarantees the document is closed, even when the
    # processing of a page raises.
    with pymupdf.open(stream=file.read()) as document:
        audit_dict_doc = {
            "number_of_pages": len(document),
            "number_of_images": 0,
            "number_of_links": 0,
            "number_of_tables": 0,
            "number_of_tokens": 0,
            "number_of_words": 0,
            # Placeholder; replaced by the model's answer (a string) below.
            "key_words": []
        }

        for page in document:
            audit_dict_page = {}
            page_content = {
                "images": [],
                "texte": "",
                "liens": [],
                "tableaux": []
            }

            # Number of images
            images = page.get_images()
            number_images = len(images)
            audit_dict_page["number_of_images"] = number_images
            audit_dict_doc["number_of_images"] += number_images

            # Image payloads and (possibly capped) display dimensions
            for img in images:
                xref = img[0]
                base_image = document.extract_image(xref)

                image_bytes = base_image["image"]
                image_width = base_image["width"]
                image_height = base_image["height"]

                # Adjust image size if it exceeds the maximum width,
                # scaling the height by the same ratio.
                if image_width > max_img_width:
                    ratio = max_img_width / image_width
                    image_width = max_img_width
                    image_height = int(image_height * ratio)

                page_content["images"].append(
                    (image_bytes, image_width, image_height)
                )

            # Links: only URI links that actually carry a "uri" entry
            links = [
                {"uri": link["uri"], "page": page.number}
                for link in page.get_links()
                if link['kind'] == pymupdf.LINK_URI and 'uri' in link
            ]
            page_content["liens"] = links

            number_links = len(links)
            audit_dict_page["number_of_links"] = number_links
            audit_dict_doc["number_of_links"] += number_links

            # Tables, converted to pandas DataFrames
            tables = page.find_tables().tables
            number_tables = len(tables)
            for tab in tables:
                page_content["tableaux"].append(tab.to_pandas())
            audit_dict_page["number_of_tables"] = number_tables
            audit_dict_doc["number_of_tables"] += number_tables

            # Text, with token and whitespace-separated word counts
            text = page.get_text("text")
            number_tokens = count_tokens(text)
            number_words = len(text.split())

            audit_dict_page["number_of_tokens"] = number_tokens
            audit_dict_page["number_of_words"] = number_words
            page_content["texte"] = text

            audit_dict_doc["number_of_tokens"] += number_tokens
            audit_dict_doc["number_of_words"] += number_words

            audit_dict_doc[f"page_{page.number}"] = audit_dict_page
            doc_content[f"page_{page.number}"] = page_content

    # Extract key words from the whole document. This runs after the document
    # has been closed so the file is not held open during the network call.
    text = " ".join([content["texte"] for content in doc_content.values()])
    prompt = f'''Voici le document:
        - {text}
        Veuillez extraire les cinq mots clés les plus pertinents de cette liste. Chaque mot clé doit contenir au maximum deux mots.

        TA REPONSE DOIT RESPECTER LE FORMAT SUIVANT :
        key_word1, key_word2, key_word3, key_word4, key_word5
    '''
    key_words_extracted = extract_relevant_keywords(prompt)
    audit_dict_doc["key_words"] = "\n" + key_words_extracted

    # Bundle the statistics and the extracted content together
    global_audit = {
        "audit": audit_dict_doc,
        "content": doc_content
    }

    return global_audit

def audit_text(text: str) -> dict:
    """Audit a plain text: token count, word count and model-extracted key words.

    Args:
        text: The text to audit.

    Returns:
        A dict with ``"audit"`` (``number_of_tokens``, ``number_of_words`` and
        ``key_words``, the model's answer prefixed with a newline) and
        ``"content"`` (the input text, unchanged).
    """
    # The key words are requested first, before any counting is done.
    keyword_prompt = f'''Voici le document:
        - {text}
        Veuillez extraire les cinq mots clés les plus pertinents de cette liste. Chaque mot clé doit contenir au maximum deux mots.

        TA REPONSE DOIT RESPECTER LE FORMAT SUIVANT :
        key_word1, key_word2, key_word3, key_word4, key_word5
    '''
    keywords_reply = extract_relevant_keywords(keyword_prompt)

    return {
        "audit": {
            "number_of_tokens": count_tokens(text),
            # Words are whitespace-separated chunks of the text.
            "number_of_words": len(text.split()),
            "key_words": "\n" + keywords_reply,
        },
        "content": text,
    }