# Hugging Face Spaces page header captured along with the source (not code):
# "Spaces: Runtime error"
import streamlit as st
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline

# Download the NLTK resources needed for tokenization, POS tagging and NER.
# Each resource is listed once (the original downloaded 'punkt' twice);
# nltk.download is a no-op when the resource is already present.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
def scrapy():
    """Scrape one Guardian article (Murakami interview) and return its text.

    Returns:
        str: the article title (if found) followed by every h2/blockquote/p
        element in document order, joined with newlines. The final three
        extracted elements are page-footer boilerplate and are dropped.

    Raises:
        requests.HTTPError: if the page request fails (non-2xx status).
    """
    session = requests.Session()
    # Browser-like headers (including a captured cookie) so the request is
    # served the full article rather than a bot/consent page.
    headers = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
        "Cookie": "GU_mvt_id=546190; bwid=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; bwid_withoutSameSiteForIncompatibleClients=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; consentUUID=7e54c557-2b08-429f-9b3a-62d04932b3aa_22; consentDate=2023-08-15T12:41:50.817Z; _ga=GA1.2.1086360360.1692103312; _gid=GA1.2.362089074.1692103312; permutive-id=e6896ed3-6a89-426c-bced-1b3e2f395993; _cc_id=6b76286e9308ea51392d6993ac96cd0b; panoramaId_expiry=1692708112890; panoramaId=8b4cbd9cd4e1289b855afdf3abb74945a7027a222951e8665464c8751b3a5aeb; panoramaIdType=panoIndiv",
        "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }
    session.headers.update(headers)
    response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")
    response.raise_for_status()  # fail fast instead of parsing an error page

    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = []
    # Title first, then body elements in the order they appear on the page.
    title = soup.find('h1')
    if title:
        paragraphs.append(title.get_text())
    for element in soup.find_all(['h2', 'blockquote', 'p']):
        paragraphs.append(element.get_text())
    # The last three elements are footer boilerplate, not article content.
    paragraphs = paragraphs[:-3]
    return '\n'.join(paragraphs)
def reconocedor_de_entidades(texto):
    """NLTK-based named-entity recognizer.

    Tokenizes *texto*, POS-tags the tokens, chunks named entities with
    ``ne_chunk``, and returns a list of ``(entity_text, label)`` tuples —
    one per named-entity subtree found.
    """
    tokens = word_tokenize(texto)
    tagged = pos_tag(tokens)
    tree = ne_chunk(tagged)
    # Only Tree nodes are named entities; plain (word, tag) leaves are skipped.
    return [
        (" ".join(word for word, _tag in subtree.leaves()), subtree.label())
        for subtree in tree
        if isinstance(subtree, nltk.Tree)
    ]
def extract_entity(text_list, ner_pipeline=None):
    """Run a NER pipeline over each paragraph and de-duplicate the entities.

    Args:
        text_list: iterable of paragraph strings.
        ner_pipeline: callable mapping a string to a list of entity dicts,
            each containing a 'word' key (e.g. the pipeline returned by
            ``download_bert()``). If omitted, falls back to a module-level
            ``nlp_ner`` — which the original code relied on but never
            defined, raising NameError; pass the pipeline explicitly.

    Returns:
        list of entity dicts, keeping only the first occurrence of each
        distinct 'word' value, in encounter order.
    """
    if ner_pipeline is None:
        # Preserve the original fallback (and its failure mode if the
        # global was never created).
        ner_pipeline = nlp_ner
    entities = []
    # NOTE(review): the original wrapped this loop in tqdm; console progress
    # is invisible inside a Streamlit app, so it is dropped here.
    for paragraph in text_list:
        entities.extend(ner_pipeline(paragraph))
    # Keep the first entry seen for each 'word'.
    seen_words = set()
    unique_entities = []
    for entry in entities:
        if entry['word'] not in seen_words:
            seen_words.add(entry['word'])
            unique_entities.append(entry)
    return unique_entities
def download_bert():
    """Load BERT-large fine-tuned on CoNLL-03 and return a HF NER pipeline.

    Downloads (or reads from cache) the model and tokenizer, then builds a
    ``pipeline("ner", ...)`` with simple aggregation so sub-word pieces are
    merged into whole-entity spans.
    """
    model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
    model_model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
    nlp_ner = pipeline(
        "ner",
        model=model_model_bert_large,
        tokenizer=tokenizer_bert_large,
        aggregation_strategy="simple",
    )
    return nlp_ner
if __name__ == '__main__':
    # Scrape the article; scrapy() returns ONE newline-joined string.
    text = scrapy()
    st.markdown(" [Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")
    nlp_bert = download_bert()
    st.write('BERT download')

    # BUG FIX: the original iterated `text` directly, which walks the string
    # one CHARACTER at a time (and made `len(text)` a character count).
    # Split it back into the paragraphs scrapy() joined with '\n'.
    paragraphs = text.split('\n')
    total_paragraphs = len(paragraphs)

    progress_text = "Operation in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)
    entities_bert = []
    for i, paragraph in enumerate(paragraphs):
        # Advance the progress bar, then run NER on the current paragraph.
        my_bar.progress((i + 1) / total_paragraphs, text=progress_text)
        entities_bert.extend(nlp_bert(paragraph))
    # Loop finished: pin the bar at 100%.
    my_bar.progress(1.0, text=progress_text)