blasblues committed on
Commit
a9d9e0a
1 Parent(s): de4c092

Create app.py

Files changed (1)
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
import streamlit as st
import requests
import numpy as np
import pandas as pd
import nltk
import spacy
from bs4 import BeautifulSoup
from tqdm import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Download the required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')


# Scrape the Guardian article and return its text, one element per line
def scrapy():
    # Start a session
    session = requests.Session()

    # Define the request headers
    headers = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
        "Cookie": "GU_mvt_id=546190; bwid=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; bwid_withoutSameSiteForIncompatibleClients=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; consentUUID=7e54c557-2b08-429f-9b3a-62d04932b3aa_22; consentDate=2023-08-15T12:41:50.817Z; _ga=GA1.2.1086360360.1692103312; _gid=GA1.2.362089074.1692103312; permutive-id=e6896ed3-6a89-426c-bced-1b3e2f395993; _cc_id=6b76286e9308ea51392d6993ac96cd0b; panoramaId_expiry=1692708112890; panoramaId=8b4cbd9cd4e1289b855afdf3abb74945a7027a222951e8665464c8751b3a5aeb; panoramaIdType=panoIndiv",
        "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    # Update the session with the headers; every request made through this
    # session will now send them automatically
    session.headers.update(headers)

    response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the article content, starting with the title
    paragraphs = []
    title = soup.find('h1')
    if title:
        paragraphs.append(title.get_text())

    # Iterate through the article body and add each element in the order it appears
    for element in soup.find_all(['h2', 'blockquote', 'p']):
        paragraphs.append(element.get_text())

    # Drop the last three scraped elements (trailing non-article content)
    paragraphs = paragraphs[:-3]

    return '\n'.join(paragraphs)


def reconocedor_de_entidades(texto):
    # Tokenization
    palabras = word_tokenize(texto)

    # Part-of-speech (POS) tagging
    palabras_etiquetadas = pos_tag(palabras)

    # Named-entity recognition
    arbol_entidades = ne_chunk(palabras_etiquetadas)

    # Extract the entities from the tree
    entidades = []
    for subtree in arbol_entidades:
        if isinstance(subtree, nltk.Tree):
            entidad = " ".join([word for word, tag in subtree.leaves()])
            etiqueta = subtree.label()
            entidades.append((entidad, etiqueta))

    return entidades
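
# Illustrative usage (an added example, not in the original commit); the exact
# labels depend on NLTK's built-in chunker, but the output is a list of
# (entity, label) tuples, e.g.:
#   reconocedor_de_entidades("Haruki Murakami was born in Kyoto.")
#   -> [('Haruki Murakami', 'PERSON'), ('Kyoto', 'GPE')]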


def extract_entity(text_list, nlp_ner):
    # Run the given NER pipeline over every paragraph in text_list and collect the entities
    entities = []
    for paragraph in tqdm(text_list):
        entity = nlp_ner(paragraph)
        entities.extend(entity)

    # Remove duplicates, keeping the first occurrence of each entity word
    seen_words = set()
    unique_entities = [entry for entry in entities if entry['word'] not in seen_words and not seen_words.add(entry['word'])]

    return unique_entities


# Cache the loaded model as a resource so it is only downloaded and built once per session
@st.cache_resource
def download_bert():
    model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"

    model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
    tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
    nlp_ner = pipeline("ner", model=model_bert_large, tokenizer=tokenizer_bert_large, aggregation_strategy="simple")

    return nlp_ner
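
# Illustrative output shape (an added note, not in the original commit): with
# aggregation_strategy="simple" the pipeline returns one dict per entity, roughly
#   {'entity_group': 'PER', 'score': 0.99, 'word': 'Haruki Murakami', 'start': 0, 'end': 15}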


if __name__ == '__main__':

    # Scrape the article; scrapy() returns one element per line, so split it back
    # into a list of paragraphs before running NER
    text = scrapy()
    paragraphs = text.split('\n')

    st.markdown("[Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")

    nlp_bert = download_bert()

    st.write('BERT model loaded')

    progress_text = "Operation in progress. Please wait."
    total_paragraphs = len(paragraphs)
    my_bar = st.progress(0, text=progress_text)

    entities_bert = []

    for i, paragraph in enumerate(paragraphs):
        # Update the progress bar with the current progress
        percent_complete = (i + 1) / total_paragraphs
        my_bar.progress(percent_complete, text=progress_text)

        # Run the BERT NER pipeline on the current paragraph
        entity = nlp_bert(paragraph)
        entities_bert.extend(entity)

    # When the loop is complete, set the progress to 100%
    my_bar.progress(1.0, text=progress_text)
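
    # A possible follow-up (a sketch, not part of the original commit): the loop above
    # collects entities_bert but never displays it. One way to surface the results,
    # reusing the pandas import and the dedup pattern from extract_entity:
    seen_words = set()
    unique_entities = [e for e in entities_bert if e['word'] not in seen_words and not seen_words.add(e['word'])]
    st.dataframe(pd.DataFrame(unique_entities))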