Create app.py
app.py
ADDED
@@ -0,0 +1,154 @@
import streamlit as st
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Download the required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

# Scrape the Guardian article and return its text as a list of paragraphs
def scrapy():
    # Start a session
    session = requests.Session()

    # Define the headers
    headers = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
        "Cookie": "GU_mvt_id=546190; bwid=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; bwid_withoutSameSiteForIncompatibleClients=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; consentUUID=7e54c557-2b08-429f-9b3a-62d04932b3aa_22; consentDate=2023-08-15T12:41:50.817Z; _ga=GA1.2.1086360360.1692103312; _gid=GA1.2.362089074.1692103312; permutive-id=e6896ed3-6a89-426c-bced-1b3e2f395993; _cc_id=6b76286e9308ea51392d6993ac96cd0b; panoramaId_expiry=1692708112890; panoramaId=8b4cbd9cd4e1289b855afdf3abb74945a7027a222951e8665464c8751b3a5aeb; panoramaIdType=panoIndiv",
        "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    # Update the session with the headers
    session.headers.update(headers)

    # Requests made with this session send the headers automatically
    response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the full article content in order
    paragraphs = []

    # Extract the title and add it to the content
    title = soup.find('h1')
    if title:
        paragraphs.append(title.get_text())

    # Iterate through the article content and add each element in the order it appears
    for element in soup.find_all(['h2', 'blockquote', 'p']):
        paragraphs.append(element.get_text())

    # print(paragraphs)

    # Drop the trailing footer elements
    paragraphs = paragraphs[:-3]

    # Return the paragraph list itself; the original returned
    # '\n'.join(paragraphs), a single string, which made the main loop
    # below iterate character by character instead of paragraph by paragraph
    return paragraphs

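# Illustrative usage sketch (not part of the original app): scrapy() returns
# the headline followed by the article's subheadings, pull quotes, and body
# paragraphs as a list of strings:
#   paragraphs = scrapy()
#   paragraphs[0]  # -> the article headline
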
def reconocedor_de_entidades(texto):
    # Tokenization
    palabras = word_tokenize(texto)

    # Part-of-speech (POS) tagging
    palabras_etiquetadas = pos_tag(palabras)

    # Named-entity recognition
    arbol_entidades = ne_chunk(palabras_etiquetadas)

    # Extract the entities from the tree
    entidades = []
    for subtree in arbol_entidades:
        if isinstance(subtree, nltk.Tree):
            entidad = " ".join([word for word, tag in subtree.leaves()])
            etiqueta = subtree.label()
            entidades.append((entidad, etiqueta))

    return entidades

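# A minimal usage sketch (not in the original app); the sample sentence and
# the exact labels are assumptions, using NLTK's standard NE types:
#   reconocedor_de_entidades("Haruki Murakami was born in Kyoto.")
#   # -> [('Haruki Murakami', 'PERSON'), ('Kyoto', 'GPE')]
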
def extract_entity(text_list, nlp_ner):
    # Run the NER pipeline over each paragraph and collect the entities.
    # nlp_ner is now passed in explicitly; the original read it from a
    # global that was never defined at module level.
    entities = []
    for paragraph in tqdm(text_list):
        entity = nlp_ner(paragraph)
        entities.extend(entity)

    # Delete duplicates, keeping the first occurrence of each word
    seen_words = set()
    unique_entities = [entry for entry in entities if entry['word'] not in seen_words and not seen_words.add(entry['word'])]

    return unique_entities

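# The comprehension above dedups in one pass: set.add() returns None (falsy),
# so "not seen_words.add(...)" is always True and merely records the word.
# Illustrative sketch of the same trick on plain data (not in the original):
#   seen = set()
#   [x for x in ['a', 'b', 'a'] if x not in seen and not seen.add(x)]
#   # -> ['a', 'b']
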
# Cache the model with st.cache_resource: unlike st.cache_data (used in the
# original), it does not attempt to serialize the returned pipeline object
@st.cache_resource
def download_bert():
    model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"

    model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
    tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
    nlp_ner = pipeline("ner", model=model_bert_large, aggregation_strategy="simple", tokenizer=tokenizer_bert_large)

    return nlp_ner

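# Illustrative output sketch (an assumption, not in the original app): with
# aggregation_strategy="simple" the pipeline returns one dict per entity span:
#   nlp = download_bert()
#   nlp("Haruki Murakami writes in Tokyo.")
#   # -> [{'entity_group': 'PER', 'word': 'Haruki Murakami', 'score': ..., ...},
#   #     {'entity_group': 'LOC', 'word': 'Tokyo', 'score': ..., ...}]
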
if __name__ == '__main__':

    text = scrapy()

    st.markdown("[Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")

    nlp_bert = download_bert()

    st.write('BERT downloaded')

    progress_text = "Operation in progress. Please wait."
    total_paragraphs = len(text)
    my_bar = st.progress(0, text=progress_text)

    entities_bert = []

    for i, paragraph in enumerate(text):
        # Update the progress bar with the current progress
        percent_complete = (i + 1) / total_paragraphs
        my_bar.progress(percent_complete, text=progress_text)

        # Run the BERT NER pipeline on the current paragraph (the original
        # comment attributed this to spaCy, but nlp_bert is a transformers pipeline)
        entity = nlp_bert(paragraph)
        entities_bert.extend(entity)

    # When the loop is complete, set the progress to 100%
    my_bar.progress(1.0, text=progress_text)
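    # A possible next step (an assumption, not in the original app): tabulate
    # the collected entities with the already-imported pandas
    # st.dataframe(pd.DataFrame(entities_bert))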