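"""Gradio demo: collocation search over an Italian text.

Given a target word, the app finds every sentence in text.txt that
contains it and tallies the lemmas of co-occurring verbs ('azioni'),
adjectives ('caratteristiche'), or nouns ('concetti').
"""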
import gradio as gr

import nltk
import spacy
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize

# Sentence tokenizer models (newer NLTK releases may also need 'punkt_tab').
nltk.download('punkt')

file = "text.txt"

# Small Italian spaCy pipeline: install with `python -m spacy download it_core_news_sm`.
nlp_IT = spacy.load("it_core_news_sm")

def get_lists(file):
  """Read `file` and return its sentences, plus a lowercased copy for matching."""
  with open(file, 'r', encoding='utf-8') as f:
    text = f.read()

  sent_tokenized_text = sent_tokenize(text, language='italian')
  sent_tokenized_text_lower = [sent.lower() for sent in sent_tokenized_text]

  return sent_tokenized_text, sent_tokenized_text_lower

sentences, sentences_lower = get_lists(file)
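# Parallel lists: `sentences` keeps the original casing, while
# `sentences_lower` is what the search actually matches against.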

def search_engine_collocations(target='scarto', colloc='azioni', nlp=nlp_IT, sentences_lower=sentences_lower):
  """Tally lemmas of verbs/adjectives/nouns in sentences containing `target`."""
  verbs = []
  adjectives = []
  nouns = []
  result = 0

  for sent in sentences_lower:
    if target.lower() in sent:
      result += 1
      doc = nlp(sent)
      for token in doc:
        if token.pos_ == 'VERB':
          verbs.append(token.lemma_)
        elif token.pos_ == 'ADJ':
          adjectives.append(token.lemma_)
        elif token.pos_ == 'NOUN':
          nouns.append(token.lemma_)

  if result == 0:
    return f"Non ho trovato la parola '{target}'.\n\n"

  # Pick the lemma list matching the requested collocation type.
  if colloc == 'azioni':
    found, label = verbs, 'azioni legate'
  elif colloc == 'caratteristiche':
    found, label = adjectives, 'caratteristiche legate'
  elif colloc == 'concetti':
    found, label = nouns, 'concetti legati'
  else:
    return f"Tipo di collocazione sconosciuto: '{colloc}'"

  if not found:
    return f"Non ho trovato {label} a '{target}'"

  # Rank the collected lemmas by frequency, most common first.
  fdist = FreqDist(found)
  stringed_results = ''
  for n, r in enumerate(fdist.most_common()):
    stringed_results += str(n + 1) + ': ' + str(r) + '\n\n'

  return f"Ho trovato {len(found)} {label} a '{target}'\n{stringed_results}\n\n"

# Gradio UI: a textbox for the target word, a radio for the collocation
# type, and a plain-text output; examples prefill both inputs.
demo = gr.Interface(
    search_engine_collocations,
    [
        gr.Textbox(),
        gr.Radio(["azioni", "caratteristiche", "concetti"]),
    ],
    "text",
    examples=[
        ["scarto", "azioni"],
        ["rifiuto", "caratteristiche"],
        ["sostenibilità", "concetti"],
    ],
)

demo.launch()
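# The search can also be exercised without the UI, e.g.:
#   print(search_engine_collocations('rifiuto', 'caratteristiche'))
# (assuming text.txt is present next to this script, as above).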