text_to_summary / app.py
sy-lac's picture
text_to_summary
f0d8b59
raw
history blame
5.26 kB
import streamlit as st
from collections import Counter
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing import text,sequence
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob, Blobber
from textblob_fr import PatternTagger, PatternAnalyzer
import spacy.cli
spacy.cli.download("fr_core_news_md")
import torch
import sentencepiece as spm
from transformers import CamembertTokenizer, CamembertModel
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
# word count and unique-word count
def number_words(text):
    """Report how many words, and how many distinct words, *text* contains.

    Words are the whitespace-separated chunks produced by ``str.split()``;
    no case folding or punctuation stripping is applied.

    Returns:
        A pair of French display strings: (total word count, unique word count).
    """
    tokens = text.split()
    total_label = f'Nombre de mots : {len(tokens)}'
    unique_label = f'Nombre de mots uniques : {len(set(tokens))}'
    return total_label, unique_label
# polarity
def polarity(text):
    """Describe the sentiment polarity of *text* as a French sentence.

    The score is the first element of the ``sentiment`` value produced by a
    TextBlob ``Blobber`` configured with the textblob_fr ``PatternTagger``
    and ``PatternAnalyzer``.

    Returns:
        A French sentence embedding the score and stating whether the text
        is negative (score < 0), positive (score > 0) or neutral (score == 0).
    """
    tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    # Analyse the text once: every tb(text) call builds a new blob and
    # re-runs the sentiment analysis.
    score = tb(text).sentiment[0]
    if score < 0:
        return f'La polarité de ce texte est {score} : ce texte est plus négatif que positif'
    if score > 0:
        return f'La polarité de ce texte est {score} : ce texte est plus positif que négatif'
    return f'La polarité de ce texte est {score} : ce texte est neutre, pas plus négatif que positif'
# subjectivity
def subjectivity(text):
    """Describe the subjectivity of *text* as a French sentence.

    The score is the second element of the ``sentiment`` value produced by a
    TextBlob ``Blobber`` configured with the textblob_fr ``PatternTagger``
    and ``PatternAnalyzer``; 0.5 is treated as the neutral midpoint.

    Returns:
        A French sentence embedding the score and stating whether the text
        is more factual (score < 0.5), more subjective (score > 0.5) or
        balanced (score == 0.5).
    """
    tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    # Analyse the text once instead of once per comparison and per message.
    score = tb(text).sentiment[1]
    if score < 0.5:
        # Below the midpoint the text leans factual; the previous message
        # was a copy of the "> 0.5" branch and wrongly said "subjectif".
        return f'La subjectivité de ce texte est {score} : ce texte est plus factuel que subjectif'
    if score > 0.5:
        return f'La subjectivité de ce texte est {score} : ce texte est plus subjectif que factuel'
    return f'La subjectivité de ce texte est {score} : ce texte est neutre, pas plus subjectif que factuel'
# keywords
def keywords(text):
    """Extract the most frequent nouns, proper nouns and verbs of *text*.

    The text is parsed with the spaCy ``fr_core_news_md`` pipeline; tokens
    whose part-of-speech tag is NOUN, PROPN or VERB are counted by their
    surface form (no lemmatisation or case folding).

    Returns:
        A pair ``('mots clés :', words)`` where ``words`` is a single string
        holding up to ten of the most frequent tokens, most frequent first,
        separated by ``', '``.
    """
    nlp = spacy.load("fr_core_news_md")
    doc = nlp(text)
    counts = Counter(
        token.text for token in doc if token.pos_ in ('NOUN', 'PROPN', 'VERB')
    )
    # most_common() yields (word, count) pairs; only the words are displayed.
    top_words = [word for word, _ in counts.most_common(10)]
    return 'mots clés :', ', '.join(top_words)
# summary1
def summary_1(text):
    """Build a short extractive summary of *text* from CamemBERT embeddings.

    The text is split into sentences, each sentence is embedded as the mean
    of its ``camembert-base`` token vectors, and for each of the first two
    sentences the *other* sentence closest to it (cosine similarity) is
    selected.

    Returns:
        The selected sentences joined by a single space. A text with fewer
        than two sentences is returned as its sentences joined by a space
        (``''`` for an empty text), since there is nothing to rank.
    """
    sentences = sent_tokenize(text)
    # With fewer than two sentences there is no "other" sentence to pick;
    # bail out before loading the model (this case used to raise).
    if len(sentences) < 2:
        return ' '.join(sentences)

    model = CamembertModel.from_pretrained('camembert-base')
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

    ## preprocessing, padding, encoding
    # Let the tokenizer pad with its own pad token and build the matching
    # attention mask (hand-padding with id 0 is not CamemBERT's pad id);
    # truncation keeps over-long sentences within the model's input limit.
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    attention_mask = encoded['attention_mask']

    ## embedding
    with torch.no_grad():
        last_hidden_states = model(
            input_ids=encoded['input_ids'],
            attention_mask=attention_mask,
        )[0]
    # Mean-pool over real tokens only so padding does not dilute the
    # embeddings of short sentences. Every row keeps its special tokens,
    # so the divisor is never zero.
    mask = attention_mask.unsqueeze(-1).to(last_hidden_states.dtype)
    sentence_embeddings = (
        (last_hidden_states * mask).sum(dim=1) / mask.sum(dim=1)
    ).numpy()

    ## summarizing
    similarity_matrix = cosine_similarity(sentence_embeddings)
    num_sentences = 2
    summary_sentences = []
    for i in range(num_sentences):
        row = similarity_matrix[i]
        # Exclude sentence i itself explicitly rather than assuming it always
        # ranks first; ties go to the earliest sentence.
        closest = max(
            (j for j in range(len(sentences)) if j != i),
            key=lambda j: row[j],
        )
        summary_sentences.append(sentences[closest])
    return ' '.join(summary_sentences)
# summary2
def summary_2(text):
    """Select the sentences of *text* that mention one of its top three nouns.

    The text is parsed with the spaCy ``fr_core_news_md`` pipeline; NOUN and
    PROPN tokens are counted by surface form and the three most frequent are
    kept. Each sentence (NLTK ``sent_tokenize``) is retained when at least
    one of its ``word_tokenize`` tokens is one of those words.

    Returns:
        The list of retained sentences, in their original order.
    """
    nlp = spacy.load("fr_core_news_md")
    doc = nlp(text)
    noun_counts = Counter(
        token.text for token in doc if token.pos_ in ('NOUN', 'PROPN')
    )
    # Keep only the words: most_common() yields (word, count) pairs, and
    # intersecting those pairs with string tokens never matched anything,
    # so the summary used to be always empty.
    top_words = {word for word, _ in noun_counts.most_common(3)}
    return [
        sent
        for sent in sent_tokenize(text)
        if top_words.intersection(word_tokenize(sent))
    ]
def analyze_text(text):
    """Run every analysis on *text* and bundle the results.

    Returns:
        A 6-tuple, in this order: word counts, polarity, subjectivity,
        keywords, first summary, second summary. Each element is whatever
        the corresponding analysis function returns.
    """
    # Tuple elements are evaluated left to right, so the analyses run in
    # the same order as they are listed.
    return (
        number_words(text),
        polarity(text),
        subjectivity(text),
        keywords(text),
        summary_1(text),
        summary_2(text),
    )
# Streamlit page: a text box, an "Analyze" button, and the analysis results.
st.title("Text Analysis and Summary")
text = st.text_area("Enter text here:")
if st.button("Analyze"):
    # Ignore whitespace-only input as well as an empty box.
    if text.strip():
        nb_mots, polarite, subjectivite, mots_cles, resume1, resume2 = analyze_text(text)
        # number_words() returns two ready-made labels; show each on its own
        # line instead of interpolating the tuple's repr.
        for ligne in nb_mots:
            st.write(ligne)
        st.write(f'Polarité : {polarite}')
        st.write(f'Subjectivité : {subjectivite}')
        # keywords() returns (label, comma-separated words); joining the pair
        # would render the label as if it were a keyword.
        st.write(f'Mots clés : {mots_cles[1]}')
        st.write(f'Résumé 1 : {resume1}')
        st.write('Résumé 2 :')
        for sent in resume2:
            st.write(sent)