import gradio as gr
import nltk
import simplemma
import spacy
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from simplemma import text_lemmatizer

# Tokenizer data fetched for NLTK before sent_tokenize is called below.
nltk.download('punkt')

file = "text.txt"
nlp_IT = spacy.load("it_core_news_sm")

# Maps each selectable category to the spaCy coarse POS tag whose lemmas it
# reports and to the Italian phrase used in the result messages.
_CATEGORIES = {
    'azioni': ('VERB', 'azioni legate'),
    'caratteristiche': ('ADJ', 'caratteristiche legate'),
    'concetti': ('NOUN', 'concetti legati'),
}


def get_lists(file):
    """Read a UTF-8 text file and split it into Italian sentences.

    Args:
        file: Path of the text file to read.

    Returns:
        A pair ``(sentences, sentences_lower)``: the sentences as produced by
        ``sent_tokenize`` and the same sentences lower-cased, in the same
        order.
    """
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    sent_tokenized_text = sent_tokenize(text, language='italian')
    sent_tokenized_text_lower = [sent.lower() for sent in sent_tokenized_text]
    return sent_tokenized_text, sent_tokenized_text_lower


sentences, sentences_lower = get_lists(file)


def _format_ranking(lemmas):
    """Return the lemmas ranked by frequency, one numbered entry per block."""
    ranked = FreqDist(lemmas).most_common()
    return ''.join(
        f"{rank}: {entry}\n\n" for rank, entry in enumerate(ranked, start=1)
    )


def search_engine_collocations(target='scarto', colloc='azioni', nlp=nlp_IT,
                               sentences_lower=sentences_lower,
                               sentences=sentences):
    """Report the lemmas that co-occur with ``target`` in the corpus.

    Every lower-cased sentence that contains ``target`` (case-insensitive
    substring match) is parsed with ``nlp``; the lemmas of its verbs,
    adjectives and nouns are collected, and those of the requested category
    are returned ranked by frequency.

    Args:
        target: Word (or substring) to look for.
        colloc: Category to report: ``'azioni'`` (verbs),
            ``'caratteristiche'`` (adjectives) or ``'concetti'`` (nouns).
        nlp: Callable that turns a sentence into an iterable of tokens
            exposing ``pos_`` and ``lemma_``.
        sentences_lower: Lower-cased sentences to search.
        sentences: Original-case sentences; accepted for interface
            compatibility but not used by the search.

    Returns:
        A user-facing message (in Italian) with the ranked lemmas, a
        "not found" message when the word or the category has no hits, or
        ``None`` when ``colloc`` is not one of the known categories.
    """
    needle = target.lower()
    lemmas_by_pos = {pos: [] for pos, _ in _CATEGORIES.values()}
    matches = 0

    for sent in sentences_lower:
        if needle not in sent:
            continue
        matches += 1
        for token in nlp(sent):
            bucket = lemmas_by_pos.get(token.pos_)
            if bucket is not None:
                bucket.append(token.lemma_)

    if matches == 0:
        return f"Non ho trovato la parola '{target}'.\n\n"

    # No category selected (or an unknown one): there is nothing to report.
    if colloc not in _CATEGORIES:
        return None

    # Only the requested category decides the outcome, so an empty verb list
    # can no longer hide the adjective or noun results.
    pos, phrase = _CATEGORIES[colloc]
    lemmas = lemmas_by_pos[pos]
    if not lemmas:
        return f"Non ho trovato {phrase} a '{target}'"
    return (
        f"Ho trovato {len(lemmas)} {phrase} a '{target}'\n"
        f"{_format_ranking(lemmas)}\n\n"
    )


# Web UI: a free-text query plus a category selector, rendered as plain text.
demo = gr.Interface(
    search_engine_collocations,
    [
        gr.Textbox(),
        gr.Radio(["azioni", "caratteristiche", "concetti"]),
    ],
    "text",
    examples=[
        ["scarto", "azioni"],
        ["rifiuto", "caratteristiche"],
        ["sostenibilità", "concetti"],
    ],
)

demo.launch()