# nlpproyect/app.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from transformers import BertTokenizer, BertForTokenClassification, pipeline
from tqdm import tqdm

# Download the NLTK resources used below (tokenizer, POS tagger, NE chunker)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
# Simple requests/BeautifulSoup scraper (not the Scrapy framework)
def scrapy():
    # Start a session
    session = requests.Session()
    # Define browser-like headers for the request
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
        "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }
    # Update the session with the headers
    session.headers.update(headers)
    # Requests made with this session now send the headers automatically
    response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Collect the full article content, in order, as a list of strings
    paragraphs = []
    # Extract the title and add it to the content
    title = soup.find('h1')
    if title:
        paragraphs.append(title.get_text())
    # Add each subheading, quote and paragraph in the order it appears
    for element in soup.find_all(['h2', 'blockquote', 'p']):
        paragraphs.append(element.get_text())
    # Drop the last three elements (page furniture, not article text)
    paragraphs = paragraphs[:-3]
    # Return the list so callers can process the article paragraph by paragraph
    return paragraphs
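# Quick illustration of the return shape (hypothetical values, not actual output):
#   parts = scrapy()
#   parts[0]  -> the article headline
#   parts[1:] -> subheadings, pull quotes and body paragraphs, in page order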
def reconocedor_de_entidades(texto):
    # Tokenization
    palabras = word_tokenize(texto)
    # Part-of-speech (POS) tagging
    palabras_etiquetadas = pos_tag(palabras)
    # Named-entity recognition
    arbol_entidades = ne_chunk(palabras_etiquetadas)
    # Extract the entities from the tree
    entidades = []
    for subtree in arbol_entidades:
        if isinstance(subtree, nltk.Tree):
            entidad = " ".join([word for word, tag in subtree.leaves()])
            etiqueta = subtree.label()
            entidades.append((entidad, etiqueta))
    return entidades
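# Illustrative call (the exact tags depend on NLTK's models; output shape only):
#   reconocedor_de_entidades("Haruki Murakami spoke to the Guardian in London.")
# could return pairs such as [('Haruki Murakami', 'PERSON'), ('London', 'GPE')]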
def extract_entity(text_list, nlp_ner):
    # Run the NER pipeline over each paragraph and collect the entities
    entities = []
    for paragraph in tqdm(text_list):
        entity = nlp_ner(paragraph)
        entities.extend(entity)
    # Remove duplicates, keeping the first occurrence of each entity word
    seen_words = set()
    unique_entities = [entry for entry in entities
                       if entry['word'] not in seen_words and not seen_words.add(entry['word'])]
    return unique_entities
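# Illustrative usage (hypothetical scores), assuming the pipeline from download_bert():
#   nlp = download_bert()
#   extract_entity(["Haruki Murakami writes novels."], nlp)
# returns a list of dicts shaped like
#   [{'entity_group': 'PER', 'score': ..., 'word': 'Haruki Murakami', 'start': 0, 'end': 15}]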
@st.cache_resource
def download_bert():
    # Load a BERT model fine-tuned for NER on CoNLL-2003 and wrap it in a pipeline;
    # st.cache_resource keeps the loaded model in memory across Streamlit reruns
    model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
    tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
    nlp_ner = pipeline("ner", model=model_bert_large, tokenizer=tokenizer_bert_large,
                       aggregation_strategy="simple")
    return nlp_ner
if __name__ == '__main__':
    paragraphs = scrapy()
    st.markdown("[Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")
    nlp_bert = download_bert()
    st.write('BERT downloaded')
    progress_text = "Operation in progress. Please wait."
    total_paragraphs = len(paragraphs)
    my_bar = st.progress(0, text=progress_text)
    entities_bert = []
    for i, paragraph in enumerate(paragraphs):
        # Update the progress bar with the current progress
        percent_complete = (i + 1) / total_paragraphs
        my_bar.progress(percent_complete, text=progress_text)
        # Run the BERT NER pipeline on the current paragraph
        entity = nlp_bert(paragraph)
        entities_bert.extend(entity)
    # When the loop is complete, set the progress to 100%
    my_bar.progress(1.0, text=progress_text)
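    # Possible continuation (a sketch, not in the original script): deduplicate the
    # entities collected above, keeping the first occurrence of each word, and show
    # them in the app. 'word', 'entity_group' and 'score' are the keys the
    # transformers NER pipeline returns with aggregation_strategy="simple".
    seen_words = set()
    unique_entities = [e for e in entities_bert
                       if e['word'] not in seen_words and not seen_words.add(e['word'])]
    st.table([(e['word'], e['entity_group'], round(float(e['score']), 3))
              for e in unique_entities])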