import streamlit as st
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Download the required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')


def scrapy():
    """Scrape the Guardian article and return its paragraphs as a list of strings."""
    # Start a session
    session = requests.Session()

    # Define the request headers
    headers = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
        "Cookie": "GU_mvt_id=546190; bwid=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; bwid_withoutSameSiteForIncompatibleClients=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; consentUUID=7e54c557-2b08-429f-9b3a-62d04932b3aa_22; consentDate=2023-08-15T12:41:50.817Z; _ga=GA1.2.1086360360.1692103312; _gid=GA1.2.362089074.1692103312; permutive-id=e6896ed3-6a89-426c-bced-1b3e2f395993; _cc_id=6b76286e9308ea51392d6993ac96cd0b; panoramaId_expiry=1692708112890; panoramaId=8b4cbd9cd4e1289b855afdf3abb74945a7027a222951e8665464c8751b3a5aeb; panoramaIdType=panoIndiv",
        "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    # Update the session with the headers so every request sends them automatically
    session.headers.update(headers)

    response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the full article content
    paragraphs = []

    # Extract the title and add it to the content
    title = soup.find('h1')
    if title:
        paragraphs.append(title.get_text())

    # Walk the article body, adding each element in the order it appears
    for element in soup.find_all(['h2', 'blockquote', 'p']):
        paragraphs.append(element.get_text())

    # Drop the last three elements, which are trailing page boilerplate
    paragraphs = paragraphs[:-3]
    # Return the list itself so callers can iterate paragraph by paragraph
    return paragraphs


def reconocedor_de_entidades(texto):
    """NLTK-based named-entity recognizer, kept for comparison with the BERT pipeline."""
    # Tokenization
    palabras = word_tokenize(texto)

    # POS tagging
    palabras_etiquetadas = pos_tag(palabras)

    # Named-entity recognition
    arbol_entidades = ne_chunk(palabras_etiquetadas)

    # Extract the entities from the tree
    entidades = []
    for subtree in arbol_entidades:
        if isinstance(subtree, nltk.Tree):
            entidad = " ".join([word for word, tag in subtree.leaves()])
            etiqueta = subtree.label()
            entidades.append((entidad, etiqueta))

    return entidades
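# A minimal usage sketch for reconocedor_de_entidades (illustrative only; the
# sample sentence and the labels shown are assumptions, not captured output):
#
#   >>> reconocedor_de_entidades("Haruki Murakami was born in Kyoto, Japan.")
#   [('Haruki Murakami', 'PERSON'), ('Kyoto', 'GPE'), ('Japan', 'GPE')]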
def extract_entity(text_list, nlp_ner):
    # A NER pipeline is applied to each paragraph, and the entities found in
    # text_list are accumulated in the entities list. The pipeline is passed
    # in explicitly so the function does not depend on a global variable.
    entities = []
    for paragraph in tqdm(text_list):
        entity = nlp_ner(paragraph)
        entities.extend(entity)

    # Delete duplicates, keeping the first occurrence of each word
    seen_words = set()
    unique_entities = []
    for entry in entities:
        if entry['word'] not in seen_words:
            seen_words.add(entry['word'])
            unique_entities.append(entry)
    return unique_entities


# Cache the pipeline as a resource: st.cache_resource is intended for
# unserializable objects such as ML models, unlike st.cache_data
@st.cache_resource
def download_bert():
    model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
    tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
    nlp_ner = pipeline("ner", model=model_bert_large,
                       tokenizer=tokenizer_bert_large,
                       aggregation_strategy="simple")
    return nlp_ner


if __name__ == '__main__':
    paragraphs = scrapy()

    st.markdown("[Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")

    nlp_bert = download_bert()
    st.write('BERT model loaded')

    progress_text = "Operation in progress. Please wait."
    total_paragraphs = len(paragraphs)
    my_bar = st.progress(0, text=progress_text)

    entities_bert = []
    for i, paragraph in enumerate(paragraphs):
        # Update the progress bar with the current progress
        percent_complete = (i + 1) / total_paragraphs
        my_bar.progress(percent_complete, text=progress_text)

        # Run the Hugging Face BERT NER pipeline on the current paragraph
        entity = nlp_bert(paragraph)
        entities_bert.extend(entity)

    # When the loop is complete, set the progress bar to 100%
    my_bar.progress(1.0, text=progress_text)
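    # What follows is a minimal display sketch, an addition rather than part of
    # the original script. With aggregation_strategy="simple", each pipeline
    # result is a dict with the keys 'entity_group', 'score', 'word', 'start'
    # and 'end', so the collected entities can be tabulated directly.
    if entities_bert:
        df_entities = pd.DataFrame(entities_bert)
        st.dataframe(df_entities[["word", "entity_group", "score"]])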