Create app.py
app.py
ADDED
@@ -0,0 +1,154 @@
import streamlit as st
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Download the required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

# Scrape the Guardian article and return its text as a list of paragraphs
def scrapy():
    # Start a session
    session = requests.Session()

    # Define the headers
    headers = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
        "Cookie": "GU_mvt_id=546190; bwid=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; bwid_withoutSameSiteForIncompatibleClients=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; consentUUID=7e54c557-2b08-429f-9b3a-62d04932b3aa_22; consentDate=2023-08-15T12:41:50.817Z; _ga=GA1.2.1086360360.1692103312; _gid=GA1.2.362089074.1692103312; permutive-id=e6896ed3-6a89-426c-bced-1b3e2f395993; _cc_id=6b76286e9308ea51392d6993ac96cd0b; panoramaId_expiry=1692708112890; panoramaId=8b4cbd9cd4e1289b855afdf3abb74945a7027a222951e8665464c8751b3a5aeb; panoramaIdType=panoIndiv",
        "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    # Update the session with the headers
    session.headers.update(headers)

    # Requests made with this session send the headers automatically
    response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the full article content in order
    paragraphs = []

    # Extract the title and add it to the content
    title = soup.find('h1')
    if title:
        paragraphs.append(title.get_text())

    # Iterate through the article content and add each element in the order it appears
    for element in soup.find_all(['h2', 'blockquote', 'p']):
        paragraphs.append(element.get_text())

    # print(paragraphs)

    # Drop the trailing footer elements
    paragraphs = paragraphs[:-3]

    # Return the paragraph list itself; the original returned
    # '\n'.join(paragraphs), a single string, which made the main loop
    # below iterate character by character instead of paragraph by paragraph
    return paragraphs

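# Illustrative usage sketch (not part of the original app): scrapy() returns
# the headline followed by the article's subheadings, pull quotes, and body
# paragraphs as a list of strings:
#   paragraphs = scrapy()
#   paragraphs[0]  # -> the article headline
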
def reconocedor_de_entidades(texto):
    # Tokenization
    palabras = word_tokenize(texto)

    # Part-of-speech (POS) tagging
    palabras_etiquetadas = pos_tag(palabras)

    # Named-entity recognition
    arbol_entidades = ne_chunk(palabras_etiquetadas)

    # Extract the entities from the tree
    entidades = []
    for subtree in arbol_entidades:
        if isinstance(subtree, nltk.Tree):
            entidad = " ".join([word for word, tag in subtree.leaves()])
            etiqueta = subtree.label()
            entidades.append((entidad, etiqueta))

    return entidades

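# A minimal usage sketch (not in the original app); the sample sentence and
# the exact labels are assumptions, using NLTK's standard NE types:
#   reconocedor_de_entidades("Haruki Murakami was born in Kyoto.")
#   # -> [('Haruki Murakami', 'PERSON'), ('Kyoto', 'GPE')]
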
def extract_entity(text_list, nlp_ner):
    # Run the NER pipeline over each paragraph and collect the entities.
    # nlp_ner is now passed in explicitly; the original read it from a
    # global that was never defined at module level.
    entities = []
    for paragraph in tqdm(text_list):
        entity = nlp_ner(paragraph)
        entities.extend(entity)

    # Delete duplicates, keeping the first occurrence of each word
    seen_words = set()
    unique_entities = [entry for entry in entities if entry['word'] not in seen_words and not seen_words.add(entry['word'])]

    return unique_entities

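# The comprehension above dedups in one pass: set.add() returns None (falsy),
# so "not seen_words.add(...)" is always True and merely records the word.
# Illustrative sketch of the same trick on plain data (not in the original):
#   seen = set()
#   [x for x in ['a', 'b', 'a'] if x not in seen and not seen.add(x)]
#   # -> ['a', 'b']
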
# Cache the model with st.cache_resource: unlike st.cache_data (used in the
# original), it does not attempt to serialize the returned pipeline object
@st.cache_resource
def download_bert():
    model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"

    model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
    tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
    nlp_ner = pipeline("ner", model=model_bert_large, aggregation_strategy="simple", tokenizer=tokenizer_bert_large)

    return nlp_ner

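# Illustrative output sketch (an assumption, not in the original app): with
# aggregation_strategy="simple" the pipeline returns one dict per entity span:
#   nlp = download_bert()
#   nlp("Haruki Murakami writes in Tokyo.")
#   # -> [{'entity_group': 'PER', 'word': 'Haruki Murakami', 'score': ..., ...},
#   #     {'entity_group': 'LOC', 'word': 'Tokyo', 'score': ..., ...}]
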
if __name__ == '__main__':

    text = scrapy()

    st.markdown("[Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")

    nlp_bert = download_bert()

    st.write('BERT downloaded')

    progress_text = "Operation in progress. Please wait."
    total_paragraphs = len(text)
    my_bar = st.progress(0, text=progress_text)

    entities_bert = []

    for i, paragraph in enumerate(text):
        # Update the progress bar with the current progress
        percent_complete = (i + 1) / total_paragraphs
        my_bar.progress(percent_complete, text=progress_text)

        # Run the BERT NER pipeline on the current paragraph (the original
        # comment attributed this to spaCy, but nlp_bert is a transformers pipeline)
        entity = nlp_bert(paragraph)
        entities_bert.extend(entity)

    # When the loop is complete, set the progress to 100%
    my_bar.progress(1.0, text=progress_text)
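    # A possible next step (an assumption, not in the original app): tabulate
    # the collected entities with the already-imported pandas
    # st.dataframe(pd.DataFrame(entities_bert))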