import streamlit as st
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Download the required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
# Scrape the article text from The Guardian
def scrapy():
    # Start a session
    session = requests.Session()
    # Define the headers
    headers = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
        "Cookie": "GU_mvt_id=546190; bwid=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; bwid_withoutSameSiteForIncompatibleClients=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; consentUUID=7e54c557-2b08-429f-9b3a-62d04932b3aa_22; consentDate=2023-08-15T12:41:50.817Z; _ga=GA1.2.1086360360.1692103312; _gid=GA1.2.362089074.1692103312; permutive-id=e6896ed3-6a89-426c-bced-1b3e2f395993; _cc_id=6b76286e9308ea51392d6993ac96cd0b; panoramaId_expiry=1692708112890; panoramaId=8b4cbd9cd4e1289b855afdf3abb74945a7027a222951e8665464c8751b3a5aeb; panoramaIdType=panoIndiv",
        "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }
    # Update the session with the headers
    session.headers.update(headers)
    # Requests made with this session now send the headers automatically
    response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Collect the article content in order: title first, then subheadings, quotes and paragraphs
    paragraphs = []
    title = soup.find('h1')
    if title:
        paragraphs.append(title.get_text())
    for element in soup.find_all(['h2', 'blockquote', 'p']):
        paragraphs.append(element.get_text())
    # Drop the last three elements (page footer / related-content residue)
    paragraphs = paragraphs[:-3]
    return '\n'.join(paragraphs)
def reconocedor_de_entidades(texto):
    # Tokenize the text into words
    palabras = word_tokenize(texto)
    # Part-of-speech (POS) tagging
    palabras_etiquetadas = pos_tag(palabras)
    # Named-entity recognition
    arbol_entidades = ne_chunk(palabras_etiquetadas)
    # Extract the entities from the parse tree
    entidades = []
    for subtree in arbol_entidades:
        if isinstance(subtree, nltk.Tree):
            entidad = " ".join([word for word, tag in subtree.leaves()])
            etiqueta = subtree.label()
            entidades.append((entidad, etiqueta))
    return entidades
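
# Hypothetical usage sketch (not called anywhere in this app): the NLTK-based
# recognizer returns (entity, label) pairs, e.g.
#   reconocedor_de_entidades("Haruki Murakami lives in Tokyo.")
# might return something like [('Haruki Murakami', 'PERSON'), ('Tokyo', 'GPE')].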
def extract_entity(text_list, nlp_ner):
    # Run the NER pipeline over each paragraph and collect the entities.
    # The pipeline is passed in explicitly (it is not defined at module level).
    entities = []
    for paragraph in tqdm(text_list):
        entity = nlp_ner(paragraph)
        entities.extend(entity)
    # Remove duplicate entities, keeping the first occurrence of each word
    seen_words = set()
    unique_entities = [entry for entry in entities if entry['word'] not in seen_words and not seen_words.add(entry['word'])]
    return unique_entities
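
# Hypothetical usage sketch (an assumption; the main block below loops manually
# instead of calling this helper):
#   nlp_ner = download_bert()
#   unique = extract_entity(scrapy().split('\n'), nlp_ner)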
@st.cache_resource()
def download_bert():
    # Load a BERT model fine-tuned for NER on CoNLL-03 and wrap it in a pipeline.
    # st.cache_resource keeps the model in memory across reruns instead of
    # attempting to serialize it as st.cache_data would.
    model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
    tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
    nlp_ner = pipeline("ner", model=model_bert_large, aggregation_strategy="simple", tokenizer=tokenizer_bert_large)
    return nlp_ner
if __name__ == '__main__':
    # Scrape the article and split it back into paragraphs for per-paragraph NER
    text = scrapy()
    paragraphs = text.split('\n')
    st.markdown("[Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")
    nlp_bert = download_bert()
    st.write('BERT downloaded')
    progress_text = "Operation in progress. Please wait."
    total_paragraphs = len(paragraphs)
    my_bar = st.progress(0, text=progress_text)
    entities_bert = []
    for i, paragraph in enumerate(paragraphs):
        # Update the progress bar with the current progress
        percent_complete = (i + 1) / total_paragraphs
        my_bar.progress(percent_complete, text=progress_text)
        # Process the current paragraph with the Hugging Face BERT NER pipeline
        entity = nlp_bert(paragraph)
        entities_bert.extend(entity)
    # When the loop is complete, set the progress to 100%
    my_bar.progress(1.0, text=progress_text)
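
    # Sketch of a results view (an assumption; the original script stops after the
    # progress bar): de-duplicate the collected entities and show them as a table.
    seen = set()
    unique_entities = [e for e in entities_bert if e['word'] not in seen and not seen.add(e['word'])]
    st.dataframe(pd.DataFrame(unique_entities))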