File size: 5,202 Bytes
f0d8b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1c221e
617e0ad
f0d8b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a28e60
f0d8b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2648d64
f0d8b59
 
 
 
 
 
cd0e30d
5a28e60
f0d8b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd0e30d
f0d8b59
f6e46c4
cd0e30d
 
a362359
5a28e60
f0d8b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b70843c
f0d8b59
 
 
057e141
 
 
f1c221e
5a28e60
 
91ad1e0
b70843c
91ad1e0
92a650e
f1c221e
f0d8b59
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import streamlit as st
from collections import Counter
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing import text,sequence
from tensorflow.keras.preprocessing.text import Tokenizer

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

from textblob import TextBlob, Blobber
from textblob_fr import PatternTagger, PatternAnalyzer

import spacy.cli
spacy.cli.download("fr_core_news_md")

import torch
import sentencepiece as spm
from transformers import CamembertTokenizer, CamembertModel
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity


# Word count and unique-word count
def number_words(text):
    """Return a one-line French report of total and distinct word counts.

    A "word" is any whitespace-delimited chunk produced by ``str.split()``;
    no case folding or punctuation stripping is applied, so ``"Chat"``,
    ``"chat"`` and ``"chat,"`` count as three distinct words.

    Args:
        text: The raw text to measure.

    Returns:
        A string of the form
        ``'Nombre de mots : <total> - Nombre de mots uniques : <distinct>'``.
    """
    tokens = text.split()
    total = len(tokens)
    distinct = len(set(tokens))
    return f'Nombre de mots : {total} - Nombre de mots uniques : {distinct}'
    
# Polarity
def polarity(text):
    """Describe the sentiment polarity of *text* in a French sentence.

    The score is element 0 of the ``sentiment`` value produced by a TextBlob
    ``Blobber`` configured with the French ``PatternTagger`` and
    ``PatternAnalyzer``.

    Args:
        text: The text to analyse.

    Returns:
        A sentence quoting the score and stating whether the text leans
        negative (score < 0), positive (score > 0) or is neutral (score == 0).
    """
    blobber = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    score = blobber(text).sentiment[0]

    if score < 0:
        verdict = 'ce texte est plus négatif que positif'
    elif score > 0:
        verdict = 'ce texte est plus positif que négatif'
    else:
        verdict = 'ce texte est neutre, pas plus négatif que positif'

    return f'La polarité de ce texte est {score} : {verdict}'


# Subjectivity
def subjectivity(text):
    """Describe how subjective *text* is in a French sentence.

    The score is element 1 of the ``sentiment`` value produced by a TextBlob
    ``Blobber`` configured with the French ``PatternTagger`` and
    ``PatternAnalyzer``. The code treats 0.5 as the midpoint: below it the
    text is reported as more factual, above it as more subjective.

    Args:
        text: The text to analyse.

    Returns:
        A sentence quoting the score and the matching verdict.
    """
    blobber = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    # Analyse once; the score is reused for both the comparison and the message.
    score = blobber(text).sentiment[1]

    if score < 0.5:
        # Low subjectivity means the text leans factual. The previous message
        # wrongly repeated the "more subjective" wording used for score > 0.5.
        verdict = 'ce texte est plus factuel que subjectif'
    elif score > 0.5:
        verdict = 'ce texte est plus subjectif que factuel'
    else:
        verdict = 'ce texte est neutre, pas plus subjectif que factuel'

    return f'La subjectivité de ce texte est {score} : {verdict}'


# Keywords
def keywords(text):
    """Return the most frequent nouns, proper nouns and verbs found in *text*.

    The text is parsed with the spaCy ``fr_core_news_md`` pipeline (loaded on
    every call). Tokens whose coarse part-of-speech tag is ``NOUN``, ``PROPN``
    or ``VERB`` are counted by their surface form, without lemmatisation or
    case folding.

    Args:
        text: The text to analyse.

    Returns:
        A list of at most ten token strings, most frequent first.
    """
    wanted_pos = ('NOUN', 'PROPN', 'VERB')
    doc = spacy.load("fr_core_news_md")(text)
    frequencies = Counter(token.text for token in doc if token.pos_ in wanted_pos)
    # most_common yields (term, count) pairs; only the term is kept.
    return [term for term, _count in frequencies.most_common(10)]


# summary1
def summary_1(text):
    """Build a short extractive summary of *text* from CamemBERT embeddings.

    Each sentence is embedded by mean-pooling the last hidden states of
    ``camembert-base`` over its real (non-padding) tokens. For each of the
    first three sentences, the other sentence with the highest cosine
    similarity to it is selected. The selected sentences are joined with
    spaces, in selection order, without repeats.

    Args:
        text: The text to summarise.

    Returns:
        The summary string. An empty string when no sentence is found, and
        the single sentence itself when the text has only one.
    """
    sentences = sent_tokenize(text)
    # Edge cases first so no model is loaded for nothing; previously any
    # text with fewer than three sentences raised IndexError.
    if not sentences:
        return ''
    if len(sentences) == 1:
        return sentences[0]

    model = CamembertModel.from_pretrained('camembert-base')
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

    ## preprocessing, padding, encoding
    # Let the tokenizer pad with its own pad token and build the attention
    # mask; hand-padding with id 0 used a non-pad token and hid nothing from
    # the model. Truncation keeps very long sentences within the model limit.
    encoded = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    attention_mask = encoded['attention_mask']

    ## embedding
    with torch.no_grad():
        last_hidden_states = model(
            input_ids=encoded['input_ids'],
            attention_mask=attention_mask,
        )[0]

    # Mean-pool over real tokens only, so padding does not dilute the
    # embeddings of shorter sentences.
    mask = attention_mask.unsqueeze(-1).to(last_hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    sentence_embeddings = ((last_hidden_states * mask).sum(dim=1) / token_counts).numpy()

    ## summarizing
    similarity_matrix = cosine_similarity(sentence_embeddings)

    sentence_count = len(sentences)
    num_sentences = min(3, sentence_count)
    summary_sentences = []
    for i in range(num_sentences):
        row = similarity_matrix[i]
        # Exclude the sentence itself explicitly instead of assuming that
        # self-similarity always sorts into first place.
        closest = max(
            (j for j in range(sentence_count) if j != i),
            key=row.__getitem__,
        )
        candidate = sentences[closest]
        # Several anchors can share the same nearest sentence; keep it once.
        if candidate not in summary_sentences:
            summary_sentences.append(candidate)

    return ' '.join(summary_sentences)


# summary2
def summary_2(text):
    """Pick the sentences of *text* that mention its most frequent nouns.

    The text is parsed with the spaCy ``fr_core_news_md`` pipeline (loaded on
    every call). The three most frequent ``NOUN``/``PROPN`` token strings are
    the key terms. A sentence is kept when one of its whitespace-separated
    chunks equals a key term exactly, so a term followed directly by
    punctuation does not match.

    Args:
        text: The text to summarise.

    Returns:
        A list of the retained sentences in document order, without repeats.
    """
    doc = spacy.load("fr_core_news_md")(text)
    noun_counts = Counter(tok.text for tok in doc if tok.pos_ in ('NOUN', 'PROPN'))
    top_terms = [term for term, _count in noun_counts.most_common(3)]

    selected = []
    for sentence in sent_tokenize(text):
        if sentence in selected:
            continue
        if any(chunk in top_terms for chunk in sentence.split()):
            selected.append(sentence)

    return selected


def analyze_text(text):
    """Run every analysis on *text* and return the results together.

    The analyses run in this order: ``number_words``, ``polarity``,
    ``subjectivity``, ``keywords``, ``summary_1``, ``summary_2``.

    Args:
        text: The text to analyse.

    Returns:
        A 6-tuple holding each analysis result, in the order listed above.
    """
    steps = (number_words, polarity, subjectivity, keywords, summary_1, summary_2)
    return tuple(step(text) for step in steps)


def _clear_text():
    """Button callback: empty the input text area through its session-state key."""
    st.session_state["input_text"] = ""


st.title("Text Analysis and Summary")

# A single, keyed text area. The previous second `st.text_area` with the same
# label and no key collided with this one, and rebinding the local `text`
# variable never changed what the widget displayed.
text = st.text_area("Enter text here:", key="input_text")

if st.button("Analyze"):
    # Whitespace-only input is treated as empty.
    if text.strip():
        nb_mots, polarite, subjectivite, mots_cles, resume1, resume2 = analyze_text(text)

        st.write(nb_mots)
        st.write(polarite)
        st.write(subjectivite)
        st.write('Mots clés :', ', '.join(mots_cles))
        st.write(f'Résumé 1 : {resume1}')
        # summary_2 returns a list of sentences; join it so the page shows
        # text rather than a Python list repr.
        st.write(f"Résumé 2 : {' '.join(resume2)}")
    else:
        st.warning("Please enter some text to analyze.")

# The callback runs before the next rerun, which is when a keyed widget's
# state may be reset.
st.button("Clear Text Area", on_click=_clear_text)