# Hugging Face Spaces page header captured along with the source (not code):
# "Spaces: Runtime error"
import streamlit as st
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline

# Download the NLTK resources needed for tokenization, POS tagging and NER.
# Each resource is listed once (the original downloaded 'punkt' twice);
# nltk.download is a no-op when the resource is already present.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
def scrapy():
    """Scrape one Guardian article (Murakami interview) and return its text.

    Returns:
        str: the article title (if found) followed by every h2/blockquote/p
        element in document order, joined with newlines. The final three
        extracted elements are page-footer boilerplate and are dropped.

    Raises:
        requests.HTTPError: if the page request fails (non-2xx status).
    """
    session = requests.Session()
    # Browser-like headers (including a captured cookie) so the request is
    # served the full article rather than a bot/consent page.
    headers = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
        "Cookie": "GU_mvt_id=546190; bwid=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; bwid_withoutSameSiteForIncompatibleClients=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; consentUUID=7e54c557-2b08-429f-9b3a-62d04932b3aa_22; consentDate=2023-08-15T12:41:50.817Z; _ga=GA1.2.1086360360.1692103312; _gid=GA1.2.362089074.1692103312; permutive-id=e6896ed3-6a89-426c-bced-1b3e2f395993; _cc_id=6b76286e9308ea51392d6993ac96cd0b; panoramaId_expiry=1692708112890; panoramaId=8b4cbd9cd4e1289b855afdf3abb74945a7027a222951e8665464c8751b3a5aeb; panoramaIdType=panoIndiv",
        "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }
    session.headers.update(headers)
    response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")
    response.raise_for_status()  # fail fast instead of parsing an error page

    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = []
    # Title first, then body elements in the order they appear on the page.
    title = soup.find('h1')
    if title:
        paragraphs.append(title.get_text())
    for element in soup.find_all(['h2', 'blockquote', 'p']):
        paragraphs.append(element.get_text())
    # The last three elements are footer boilerplate, not article content.
    paragraphs = paragraphs[:-3]
    return '\n'.join(paragraphs)
def reconocedor_de_entidades(texto):
    """NLTK-based named-entity recognizer.

    Tokenizes *texto*, POS-tags the tokens, chunks named entities with
    ``ne_chunk``, and returns a list of ``(entity_text, label)`` tuples —
    one per named-entity subtree found.
    """
    tokens = word_tokenize(texto)
    tagged = pos_tag(tokens)
    tree = ne_chunk(tagged)
    # Only Tree nodes are named entities; plain (word, tag) leaves are skipped.
    return [
        (" ".join(word for word, _tag in subtree.leaves()), subtree.label())
        for subtree in tree
        if isinstance(subtree, nltk.Tree)
    ]
def extract_entity(text_list, ner_pipeline=None):
    """Run a NER pipeline over each paragraph and de-duplicate the entities.

    Args:
        text_list: iterable of paragraph strings.
        ner_pipeline: callable mapping a string to a list of entity dicts,
            each containing a 'word' key (e.g. the pipeline returned by
            ``download_bert()``). If omitted, falls back to a module-level
            ``nlp_ner`` — which the original code relied on but never
            defined, raising NameError; pass the pipeline explicitly.

    Returns:
        list of entity dicts, keeping only the first occurrence of each
        distinct 'word' value, in encounter order.
    """
    if ner_pipeline is None:
        # Preserve the original fallback (and its failure mode if the
        # global was never created).
        ner_pipeline = nlp_ner
    entities = []
    # NOTE(review): the original wrapped this loop in tqdm; console progress
    # is invisible inside a Streamlit app, so it is dropped here.
    for paragraph in text_list:
        entities.extend(ner_pipeline(paragraph))
    # Keep the first entry seen for each 'word'.
    seen_words = set()
    unique_entities = []
    for entry in entities:
        if entry['word'] not in seen_words:
            seen_words.add(entry['word'])
            unique_entities.append(entry)
    return unique_entities
def download_bert():
    """Load BERT-large fine-tuned on CoNLL-03 and return a HF NER pipeline.

    Downloads (or reads from cache) the model and tokenizer, then builds a
    ``pipeline("ner", ...)`` with simple aggregation so sub-word pieces are
    merged into whole-entity spans.
    """
    model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
    model_model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
    nlp_ner = pipeline(
        "ner",
        model=model_model_bert_large,
        tokenizer=tokenizer_bert_large,
        aggregation_strategy="simple",
    )
    return nlp_ner
if __name__ == '__main__':
    # Scrape the article; scrapy() returns ONE newline-joined string.
    text = scrapy()
    st.markdown(" [Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")
    nlp_bert = download_bert()
    st.write('BERT download')

    # BUG FIX: the original iterated `text` directly, which walks the string
    # one CHARACTER at a time (and made `len(text)` a character count).
    # Split it back into the paragraphs scrapy() joined with '\n'.
    paragraphs = text.split('\n')
    total_paragraphs = len(paragraphs)

    progress_text = "Operation in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)
    entities_bert = []
    for i, paragraph in enumerate(paragraphs):
        # Advance the progress bar, then run NER on the current paragraph.
        my_bar.progress((i + 1) / total_paragraphs, text=progress_text)
        entities_bert.extend(nlp_bert(paragraph))
    # Loop finished: pin the bar at 100%.
    my_bar.progress(1.0, text=progress_text)