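"""Streamlit app that scrapes a Guardian article about Haruki Murakami and runs
named-entity recognition over it, both with NLTK's classic chunker and with a
BERT model fine-tuned on CoNLL-03 served through a Hugging Face `pipeline`."""
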
import streamlit as st
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Download the NLTK resources needed for tokenization, POS tagging and NER
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

# Scrape the source article from The Guardian
def scrapy():
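	"""Fetch the Guardian article and return its text as newline-joined paragraphs."""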
	# Start a session
	session = requests.Session()

	# Define the headers
	headers = {
	    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
	    "Accept-Encoding": "gzip, deflate, br",
	    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,es;q=0.7",
	    "Cookie": "GU_mvt_id=546190; bwid=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; bwid_withoutSameSiteForIncompatibleClients=idFromPV_tGNE1Y4ziW6RF9ZU7oKWAQ; consentUUID=7e54c557-2b08-429f-9b3a-62d04932b3aa_22; consentDate=2023-08-15T12:41:50.817Z; _ga=GA1.2.1086360360.1692103312; _gid=GA1.2.362089074.1692103312; permutive-id=e6896ed3-6a89-426c-bced-1b3e2f395993; _cc_id=6b76286e9308ea51392d6993ac96cd0b; panoramaId_expiry=1692708112890; panoramaId=8b4cbd9cd4e1289b855afdf3abb74945a7027a222951e8665464c8751b3a5aeb; panoramaIdType=panoIndiv",
	    "Referer": "https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply",
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
	}

	# Update the session with the headers
	session.headers.update(headers)

	# Now you can make requests with this session and the headers will be used automatically
	response = session.get("https://www.theguardian.com/books/2022/nov/05/i-want-to-open-a-window-in-their-souls-haruki-murakami-on-the-power-of-writing-simply")

	# Parse the content with BeautifulSoup
	soup = BeautifulSoup(response.content, 'html.parser')

	# Initialize the variable that will contain the full content
	paragraphs = []

	# Extract the title and add it to the content
	title = soup.find('h1')
	if title:
	    paragraphs.append(title.get_text())

	# Iterate through the article content and add each element in the order it appears
	for element in soup.find_all(['h2', 'blockquote', 'p']):
	    paragraphs.append(element.get_text())

	# Drop the trailing paragraphs, which are not part of the article body
	paragraphs = paragraphs[:-3]


	return '\n'.join(paragraphs)


def reconocedor_de_entidades(texto):
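    """Run NLTK-based NER over `texto` and return a list of (entity, label) tuples."""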
    # Tokenization
    palabras = word_tokenize(texto)

    # Part-of-speech (POS) tagging
    palabras_etiquetadas = pos_tag(palabras)

    # Named entity recognition
    arbol_entidades = ne_chunk(palabras_etiquetadas)

    # Extract the entities from the tree
    entidades = []
    for subtree in arbol_entidades:
        if isinstance(subtree, nltk.Tree):
            entidad = " ".join([word for word, tag in subtree.leaves()])
            etiqueta = subtree.label()
            entidades.append((entidad, etiqueta))

    return entidades


def extract_entity(text_list, nlp_ner):
	# Run the NER pipeline over each paragraph and collect every detected entity.
	entities = []
	for paragraph in tqdm(text_list):
		entity = nlp_ner(paragraph)
		entities.extend(entity)

	# Drop duplicates, keeping the first occurrence of each entity word
	seen_words = set()
	unique_entities = [entry for entry in entities if entry['word'] not in seen_words and not seen_words.add(entry['word'])]

	return unique_entities


# Cache the loaded model across Streamlit reruns; st.cache_resource is meant for global resources such as ML models
@st.cache_resource
def download_bert():
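	"""Load dbmdz/bert-large-cased-finetuned-conll03-english and return a token-classification ("ner") pipeline."""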
	model_bert_large_name = "dbmdz/bert-large-cased-finetuned-conll03-english"

	model_model_bert_large = BertForTokenClassification.from_pretrained(model_bert_large_name)
	tokenizer_bert_large = BertTokenizer.from_pretrained(model_bert_large_name)
	nlp_ner = pipeline("ner", model=model_model_bert_large, tokenizer=tokenizer_bert_large, aggregation_strategy="simple")

	return nlp_ner




if __name__ == '__main__':

	text = scrapy()
	# scrapy() returns the article as newline-joined paragraphs; split it back into a list
	paragraphs = text.split('\n')


	
	st.markdown(" [Analysis with Hugging Face in this link (COLAB)](https://colab.research.google.com/drive/1J6R20SSRdx9y8GMyiayYlaMnrQVBOvaa#scrollTo=RviFJwTTVid7)")

	nlp_bert = download_bert()

	st.write('BERT model loaded')



	progress_text = "Operation in progress. Please wait."
	total_paragraphs = len(paragraphs)
	my_bar = st.progress(0, text=progress_text)

	entities_bert = []

	for i, paragraph in enumerate(paragraphs):
	    # Update the progress bar with the current progress
	    percent_complete = (i + 1) / total_paragraphs
	    my_bar.progress(percent_complete, text=progress_text)
	    
	    # Run the Hugging Face BERT NER pipeline on the current paragraph
	    entity = nlp_bert(paragraph)
	    entities_bert.extend(entity)

	# When the loop is complete, set the progress to 100%
	my_bar.progress(1.0, text=progress_text)
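
	# Display the collected entities; the pipeline output is a list of dicts, so it maps directly to a DataFrame
	st.dataframe(pd.DataFrame(entities_bert))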