File size: 5,202 Bytes
f0d8b59 f1c221e 617e0ad f0d8b59 5a28e60 f0d8b59 2648d64 f0d8b59 cd0e30d 5a28e60 f0d8b59 cd0e30d f0d8b59 f6e46c4 cd0e30d a362359 5a28e60 f0d8b59 b70843c f0d8b59 057e141 f1c221e 5a28e60 91ad1e0 b70843c 91ad1e0 92a650e f1c221e f0d8b59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
import re
from collections import Counter

import streamlit as st
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob, Blobber
from textblob_fr import PatternTagger, PatternAnalyzer
import spacy.cli
import torch
import sentencepiece as spm
from transformers import CamembertTokenizer, CamembertModel
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): the four calls below are reconstructed from truncated lines
# (e.g. "import nltk'punkt')"); presumably the app fetched these resources at
# start-up -- confirm against the original file.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
spacy.cli.download("fr_core_news_md")
# Word count and unique-word count
def number_words(text: str) -> str:
    """Return a French summary line with the total and distinct word counts.

    Words are the whitespace-separated chunks of ``text``. Punctuation stays
    attached and case is preserved, so "Chat" and "chat," count as two
    different words.

    Args:
        text: The text to measure. An empty string yields two zero counts.

    Returns:
        A string of the form
        ``'Nombre de mots : N - Nombre de mots uniques : M'``.
    """
    words = text.split()
    # A set is enough to count distinct words; no frequency table is needed.
    return f'Nombre de mots : {len(words)} - Nombre de mots uniques : {len(set(words))}'
# Polarity
def polarity(text: str) -> str:
    """Describe, in French, whether ``text`` leans negative, positive or neutral.

    The score is the first element of the ``sentiment`` value produced by a
    TextBlob ``Blobber`` configured with the French pattern tagger/analyzer.

    Args:
        text: The text to analyse.

    Returns:
        A sentence that embeds the raw score followed by its interpretation:
        negative when the score is below 0, positive when above 0, neutral
        when exactly 0.
    """
    tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    # Analyse once and reuse the score; re-running the analyzer for every
    # comparison and again for the message is wasted work.
    score = tb(text).sentiment[0]
    if score < 0:
        verdict = 'ce texte est plus négatif que positif'
    elif score > 0:
        verdict = 'ce texte est plus positif que négatif'
    else:
        verdict = 'ce texte est neutre, pas plus négatif que positif'
    return f'La polarité de ce texte est {score} : {verdict}'
# Subjectivity
def subjectivity(text: str) -> str:
    """Describe, in French, whether ``text`` reads as factual or subjective.

    The score is the second element of the ``sentiment`` value produced by a
    TextBlob ``Blobber`` configured with the French pattern tagger/analyzer.

    Args:
        text: The text to analyse.

    Returns:
        A sentence that embeds the raw score followed by its interpretation:
        more factual below 0.5, more subjective above 0.5, balanced at
        exactly 0.5.
    """
    tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    # Analyse once and reuse the score for both the test and the message.
    score = tb(text).sentiment[1]
    if score < 0.5:
        # The low-score branch used to repeat the "more subjective" wording of
        # the high-score branch, so both halves of the scale read the same.
        verdict = 'ce texte est plus factuel que subjectif'
    elif score > 0.5:
        verdict = 'ce texte est plus subjectif que factuel'
    else:
        verdict = 'ce texte est neutre, pas plus subjectif que factuel'
    return f'La subjectivité de ce texte est {score} : {verdict}'
# Keywords
def keywords(text: str) -> list:
    """Return the most frequent nouns, proper nouns and verbs found in ``text``.

    The text is tagged with the spaCy pipeline ``fr_core_news_md``; tokens
    whose part of speech is NOUN, PROPN or VERB are counted by their surface
    form (no lemmatisation, case preserved).

    Args:
        text: The text to analyse.

    Returns:
        Up to ten words, most frequent first. Empty when no token has one of
        the three parts of speech.
    """
    # NOTE(review): the pipeline is loaded on every call; if this becomes a
    # bottleneck, load it once and reuse it.
    nlp = spacy.load("fr_core_news_md")
    doc = nlp(text)
    counts = Counter(
        token.text for token in doc if token.pos_ in ('NOUN', 'PROPN', 'VERB')
    )
    # most_common yields (word, count) pairs; callers only want the words.
    return [word for word, _ in counts.most_common(10)]
# Summary 1: extractive summary based on CamemBERT sentence embeddings
def summary_1(text: str, num_sentences: int = 3) -> str:
    """Build an extractive summary from the most central sentences of ``text``.

    Each sentence is embedded with ``camembert-base`` (mean of the last hidden
    states over its real tokens). Sentences are then ranked by the sum of their
    cosine similarities to every sentence, and the ``num_sentences`` best ones
    are returned in their original order.

    Args:
        text: The text to summarise.
        num_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        The selected sentences joined by single spaces. The whole text's
        sentences are returned when there are no more than ``num_sentences``
        of them; an empty string when ``text`` has no sentence or
        ``num_sentences`` is not positive.
    """
    # The app analyses French text, so use the French sentence splitter
    # rather than NLTK's English default.
    sentences = sent_tokenize(text, language='french')
    if not sentences or num_sentences <= 0:
        return ''
    if len(sentences) <= num_sentences:
        # Nothing to rank: skip loading the model altogether.
        return ' '.join(sentences)

    model = CamembertModel.from_pretrained('camembert-base')
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

    ## preprocessing, padding, encoding
    # Let the tokenizer pad with its own pad token and produce the attention
    # mask; truncation guards against sentences longer than the model accepts.
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    ## embedding
    with torch.no_grad():
        last_hidden_states = model(**encoded)[0]
    # Average only over real tokens so padding does not dilute short sentences.
    mask = encoded['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    sentence_embeddings = ((last_hidden_states * mask).sum(dim=1) / token_counts).numpy()

    ## summarizing
    similarity_matrix = cosine_similarity(sentence_embeddings)
    centrality = similarity_matrix.sum(axis=1)
    ranked = sorted(range(len(sentences)), key=lambda idx: centrality[idx], reverse=True)
    # Re-sort the winners by position so the summary reads in document order.
    chosen = sorted(ranked[:num_sentences])
    return ' '.join(sentences[idx] for idx in chosen)
# Summary 2: sentences that mention the most frequent nouns
def summary_2(text: str) -> list:
    """Return the sentences of ``text`` that mention one of its top nouns.

    The three most frequent NOUN/PROPN tokens (by surface form, as tagged by
    the spaCy pipeline ``fr_core_news_md``) act as keywords. A sentence is
    kept when at least one of its words is a keyword.

    Args:
        text: The text to summarise.

    Returns:
        The matching sentences in document order, without duplicates. Empty
        when the text has no noun or no sentence.
    """
    nlp = spacy.load("fr_core_news_md")
    doc = nlp(text)
    counts = Counter(token.text for token in doc if token.pos_ in ('NOUN', 'PROPN'))
    top_words = {word for word, _ in counts.most_common(3)}

    summary = []
    # French splitter: the app analyses French text.
    for sent in sent_tokenize(text, language='french'):
        # Plain whitespace splitting leaves punctuation and elisions attached
        # ("chat," / "l'arbre") and would miss those keywords, so also look at
        # the bare word-like runs.
        words = set(sent.split())
        words.update(re.findall(r"[\w-]+", sent))
        if not top_words.isdisjoint(words) and sent not in summary:
            summary.append(sent)
    return summary
def analyze_text(text):
    """Run every analysis on ``text`` and return the results as a 6-tuple.

    The tuple holds, in order: the word-count line, the polarity line, the
    subjectivity line, the keywords, the first summary and the second summary.
    """
    return (
        number_words(text),
        polarity(text),
        subjectivity(text),
        keywords(text),
        summary_1(text),
        summary_2(text),
    )
def _clear_input():
    """Empty the text area; used as the "Clear Text Area" button callback."""
    st.session_state["input_text"] = ""


st.title("Text Analysis and Summary")
# Keying the widget stores its value in st.session_state, which lets the clear
# button reset this same widget instead of drawing a second text area with an
# identical label.
text = st.text_area("Enter text here:", key="input_text")
if st.button("Analyze"):
    if text.strip():
        nb_mots, polarite, subjectivite, mots_cles, resume1, resume2 = analyze_text(text)
        # Show every result that was computed, not only the keywords and summaries.
        st.write(nb_mots)
        st.write(polarite)
        st.write(subjectivite)
        st.write('Mots clés :', ', '.join(mots_cles))
        st.write(f'Résumé 1 : {resume1}')
        # resume2 is a list of sentences; join it rather than print its repr.
        st.write(f"Résumé 2 : {' '.join(resume2)}")
    else:
        st.warning("Please enter some text to analyze.")
# The callback runs before the script reruns, so the text area is redrawn empty.
st.button("Clear Text Area", on_click=_clear_input)