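"""Streamlit app that scrapes a Guardian article about Haruki Murakami and runs
named-entity recognition over it, both with NLTK's classic chunker and with a
BERT model fine-tuned on CoNLL-03 served through a Hugging Face `pipeline`."""
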
import streamlit as st
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Download the NLTK resources needed for tokenization, POS tagging and NER
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

# Scrape the source article from The Guardian
def scrapy():
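	"""Fetch the Guardian article and return its text as newline-joined paragraphs."""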
	# Start a session
	session = requests.Session()

	# Define the headers
	headers = {
	    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
	    "Accept-Encoding": "gzip, deflate, br",
	    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
	    "Cookie": "GU_mvt_id=546190; bwid=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; bwid_withoutSameSiteForIncompatibleClients=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; consentUUID=7e54c557-2b08-429f-9b3a-62d04932b3aa_22; consentDate=2023-08-15T12:41:50.817Z; _ga=GA1.2.1086360360.1692103312; _gid=GA1.2.362089074.1692103312; permutive-id=e6896ed3-6a89-426c-bced-1b3e2f395993; _cc_id=6b76286e9308ea51392d6993ac96cd0b; panoramaId_expiry=1692708112890; panoramaId=8b4cbd9cd4e1289b855afdf3abb74945a7027a222951e8665464c8751b3a5aeb; panoramaIdType=panoIndiv",
	    "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
	}

	# Update the session with the headers
	session.headers.update(headers)

	# Now you can make requests with this session and the headers will be used automatically
	response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")

	# Parse the content with BeautifulSoup
	soup = BeautifulSoup(response.content, 'html.parser')

	# Initialize the variable that will contain the full content
	paragraphs = []

	# Extract the title and add it to the content
	title = soup.find('h1')
	if title:
	    paragraphs.append(title.get_text())

	# Iterate through the article content and add each element in the order it appears
	for element in soup.find_all(['h2', 'blockquote', 'p']):
	    paragraphs.append(element.get_text())

	# Drop the trailing paragraphs, which are not part of the article body
	paragraphs = paragraphs[:-3]


	return '\n'.join(paragraphs)


def reconocedor_de_entidades(texto):
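    """Run NLTK-based NER over `texto` and return a list of (entity, label) tuples."""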
    # Tokenization
    palabras = word_tokenize(texto)

    # Part-of-speech (POS) tagging
    palabras_etiquetadas = pos_tag(palabras)

    # Named entity recognition
    arbol_entidades = ne_chunk(palabras_etiquetadas)

    # Extract the entities from the tree
    entidades = []
    for subtree in arbol_entidades:
        if isinstance(subtree, nltk.Tree):
            entidad = " ".join([word for word, tag in subtree.leaves()])
            etiqueta = subtree.label()
            entidades.append((entidad, etiqueta))

    return entidades


def extract_entity(text_list, nlp_ner):
	# Run the NER pipeline over each paragraph and collect every detected entity.
	entities = []
	for paragraph in tqdm(text_list):
		entity = nlp_ner(paragraph)
		entities.extend(entity)

	# Drop duplicates, keeping the first occurrence of each entity word
	seen_words = set()
	unique_entities = [entry for entry in entities if entry['word'] not in seen_words and not seen_words.add(entry['word'])]

	return unique_entities


# Cache the loaded model across Streamlit reruns; st.cache_resource is meant for global resources such as ML models
@st.cache_resource
def download_bert():
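	"""Load dbmdz/bert-large-cased-finetuned-conll03-english and return a token-classification ("ner") pipeline."""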
	model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"

	model_model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
	tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
	nlp_ner = pipeline("ner", model=model_model_bert_large, tokenizer=tokenizer_bert_large, aggregation_strategy="simple")

	return nlp_ner




if __name__ == '__main__':

	text = scrapy()
	# scrapy() returns the article as newline-joined paragraphs; split it back into a list
	paragraphs = text.split('\n')


	
	st.markdown(" [Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")

	nlp_bert = download_bert()

	st.write('BERT model loaded')



	progress_text = "Operation in progress. Please wait."
	total_paragraphs = len(paragraphs)
	my_bar = st.progress(0, text=progress_text)

	entities_bert = []

	for i, paragraph in enumerate(paragraphs):
	    # Update the progress bar with the current progress
	    percent_complete = (i + 1) / total_paragraphs
	    my_bar.progress(percent_complete, text=progress_text)
	    
	    # Run the Hugging Face BERT NER pipeline on the current paragraph
	    entity = nlp_bert(paragraph)
	    entities_bert.extend(entity)

	# When the loop is complete, set the progress to 100%
	my_bar.progress(1.0, text=progress_text)
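
	# Display the collected entities; the pipeline output is a list of dicts, so it maps directly to a DataFrame
	st.dataframe(pd.DataFrame(entities_bert))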