Upload 10 files
- Prove_lite.py +271 -0
- Prove_llm.py +84 -0
- SimpleUI_lite.py +122 -0
- SimpleUI_llm.py +136 -0
- UI_tester.py +52 -0
- Wikidata_Text_Parser.py +929 -0
- app.py +122 -0
- llm_load copy.py +188 -0
- llm_load.py +188 -0
- requirements.txt +118 -0
Prove_lite.py
ADDED
@@ -0,0 +1,271 @@
import pandas as pd
import numpy as np
import sqlite3, torch, json, re, os, itertools, nltk
from ast import literal_eval as leval
from tqdm.auto import tqdm
from utils.verbalisation_module import VerbModule
from utils.sentence_retrieval_module import SentenceRetrievalModule
from utils.textual_entailment_module import TextualEntailmentModule
from importlib import reload
from html.parser import HTMLParser
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gradio as gr
from bs4 import BeautifulSoup
from cleantext import clean


def verbalisation(claim_df):
    verb_module = VerbModule()
    triples = []
    for _, row in claim_df.iterrows():
        triple = {
            'subject': row['entity_label'],
            'predicate': row['property_label'],
            'object': row['object_label']
        }
        triples.append(triple)

    claim_df['verbalisation'] = verb_module.verbalise_triples(triples)
    claim_df['verbalisation_unks_replaced'] = claim_df['verbalisation'].apply(verb_module.replace_unks_on_sentence)
    claim_df['verbalisation_unks_replaced_then_dropped'] = claim_df['verbalisation'].apply(lambda x: verb_module.replace_unks_on_sentence(x, empty_after=True))
    return claim_df

def setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress):
    join_df = pd.merge(verbalised_claims_df_final, reference_text_df[['reference_id', 'url', 'html']], on='reference_id', how='left')
    SS_df = join_df[['reference_id', 'url', 'verbalisation', 'html']].copy()

    def clean_html(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        cleaned_text = clean(text,
                             fix_unicode=True,
                             to_ascii=True,
                             lower=False,
                             no_line_breaks=False,
                             no_urls=True,
                             no_emails=True,
                             no_phone_numbers=True,
                             no_numbers=False,
                             no_digits=False,
                             no_currency_symbols=True,
                             no_punct=False,
                             replace_with_url="",
                             replace_with_email="",
                             replace_with_phone_number="",
                             replace_with_number="",
                             replace_with_digit="",
                             replace_with_currency_symbol="")
        return cleaned_text

    def split_into_sentences(text):
        sentences = nltk.sent_tokenize(text)
        return sentences

    def slide_sentences(sentences, window_size=2):
        if len(sentences) < window_size:
            return [" ".join(sentences)]
        return [" ".join(sentences[i:i + window_size]) for i in range(len(sentences) - window_size + 1)]

    SS_df['html2text'] = SS_df['html'].apply(clean_html)
    SS_df['nlp_sentences'] = SS_df['html2text'].apply(split_into_sentences)
    SS_df['nlp_sentences_slide_2'] = SS_df['nlp_sentences'].apply(slide_sentences)

    return SS_df[['reference_id', 'verbalisation', 'url', 'nlp_sentences', 'nlp_sentences_slide_2']]

def evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES):
    sr_module = SentenceRetrievalModule(max_len=512)
    sentence_relevance_df = splited_sentences_from_html.copy()
    sentence_relevance_df.rename(columns={'verbalisation': 'final_verbalisation'}, inplace=True)

    def chunks(l, n):
        n = max(1, n)
        return [l[i:i + n] for i in range(0, len(l), n)]

    def compute_scores(column_name):
        all_outputs = []
        for _, row in tqdm(sentence_relevance_df.iterrows(), total=sentence_relevance_df.shape[0]):
            outputs = []
            for batch in chunks(row[column_name], BATCH_SIZE):
                batch_outputs = sr_module.score_sentence_pairs([(row['final_verbalisation'], sentence) for sentence in batch])
                outputs += batch_outputs
            all_outputs.append(outputs)
        sentence_relevance_df[f'{column_name}_scores'] = pd.Series(all_outputs)
        assert all(sentence_relevance_df.apply(lambda x: len(x[column_name]) == len(x[f'{column_name}_scores']), axis=1))

    compute_scores('nlp_sentences')
    compute_scores('nlp_sentences_slide_2')

    def get_top_n_sentences(row, column_name, n):
        sentences_with_scores = [{'sentence': t[0], 'score': t[1], 'sentence_id': f"{row.name}_{j}"} for j, t in enumerate(zip(row[column_name], row[f'{column_name}_scores']))]
        return sorted(sentences_with_scores, key=lambda x: x['score'], reverse=True)[:n]

    def filter_overlaps(sentences):
        filtered = []
        for evidence in sentences:
            if ';' in evidence['sentence_id']:
                start_id, end_id = evidence['sentence_id'].split(';')
                if not any(start_id in e['sentence_id'].split(';') or end_id in e['sentence_id'].split(';') for e in filtered):
                    filtered.append(evidence)
            else:
                if not any(evidence['sentence_id'] in e['sentence_id'].split(';') for e in filtered):
                    filtered.append(evidence)
        return filtered

    def limit_sentence_length(sentence, max_length):
        if len(sentence) > max_length:
            return sentence[:max_length] + '...'
        return sentence

    nlp_sentences_TOP_N, nlp_sentences_slide_2_TOP_N, nlp_sentences_all_TOP_N = [], [], []

    for _, row in tqdm(sentence_relevance_df.iterrows(), total=sentence_relevance_df.shape[0]):
        top_n = get_top_n_sentences(row, 'nlp_sentences', N_TOP_SENTENCES)
        top_n = [{'sentence': limit_sentence_length(s['sentence'], 1024), 'score': s['score'], 'sentence_id': s['sentence_id']} for s in top_n]
        nlp_sentences_TOP_N.append(top_n)

        top_n_slide_2 = get_top_n_sentences(row, 'nlp_sentences_slide_2', N_TOP_SENTENCES)
        top_n_slide_2 = [{'sentence': limit_sentence_length(s['sentence'], 1024), 'score': s['score'], 'sentence_id': s['sentence_id']} for s in top_n_slide_2]
        nlp_sentences_slide_2_TOP_N.append(top_n_slide_2)

        all_sentences = top_n + top_n_slide_2
        all_sentences_sorted = sorted(all_sentences, key=lambda x: x['score'], reverse=True)
        filtered_sentences = filter_overlaps(all_sentences_sorted)
        filtered_sentences = [{'sentence': limit_sentence_length(s['sentence'], 1024), 'score': s['score'], 'sentence_id': s['sentence_id']} for s in filtered_sentences]
        nlp_sentences_all_TOP_N.append(filtered_sentences[:N_TOP_SENTENCES])

    sentence_relevance_df['nlp_sentences_TOP_N'] = pd.Series(nlp_sentences_TOP_N)
    sentence_relevance_df['nlp_sentences_slide_2_TOP_N'] = pd.Series(nlp_sentences_slide_2_TOP_N)
    sentence_relevance_df['nlp_sentences_all_TOP_N'] = pd.Series(nlp_sentences_all_TOP_N)

    return sentence_relevance_df

def textEntailment(evidence_df, SCORE_THRESHOLD):
    textual_entailment_df = evidence_df.copy()
    te_module = TextualEntailmentModule()

    keys = ['TOP_N', 'slide_2_TOP_N', 'all_TOP_N']
    te_columns = {f'evidence_TE_prob_{key}': [] for key in keys}
    te_columns.update({f'evidence_TE_prob_weighted_{key}': [] for key in keys})
    te_columns.update({f'evidence_TE_labels_{key}': [] for key in keys})
    te_columns.update({f'claim_TE_prob_weighted_sum_{key}': [] for key in keys})
    te_columns.update({f'claim_TE_label_weighted_sum_{key}': [] for key in keys})
    te_columns.update({f'claim_TE_label_malon_{key}': [] for key in keys})

    def process_row(row):
        claim = row['final_verbalisation']
        results = {}
        for key in keys:
            evidence = row[f'nlp_sentences_{key}']
            evidence_size = len(evidence)
            if evidence_size == 0:
                results[key] = {
                    'evidence_TE_prob': [],
                    'evidence_TE_labels': [],
                    'evidence_TE_prob_weighted': [],
                    'claim_TE_prob_weighted_sum': [0, 0, 0],
                    'claim_TE_label_weighted_sum': 'NOT ENOUGH INFO',
                    'claim_TE_label_malon': 'NOT ENOUGH INFO'
                }
                continue

            evidence_TE_prob = te_module.get_batch_scores(
                claims=[claim] * evidence_size,
                evidence=[e['sentence'] for e in evidence]
            )

            evidence_TE_labels = [te_module.get_label_from_scores(s) for s in evidence_TE_prob]

            evidence_TE_prob_weighted = [
                probs * ev['score'] for probs, ev in zip(evidence_TE_prob, evidence)
                if ev['score'] > SCORE_THRESHOLD
            ]

            claim_TE_prob_weighted_sum = np.sum(evidence_TE_prob_weighted, axis=0) if evidence_TE_prob_weighted else [0, 0, 0]

            claim_TE_label_weighted_sum = te_module.get_label_from_scores(claim_TE_prob_weighted_sum) if evidence_TE_prob_weighted else 'NOT ENOUGH INFO'

            claim_TE_label_malon = te_module.get_label_malon(
                [probs for probs, ev in zip(evidence_TE_prob, evidence) if ev['score'] > SCORE_THRESHOLD]
            )

            results[key] = {
                'evidence_TE_prob': evidence_TE_prob,
                'evidence_TE_labels': evidence_TE_labels,
                'evidence_TE_prob_weighted': evidence_TE_prob_weighted,
                'claim_TE_prob_weighted_sum': claim_TE_prob_weighted_sum,
                'claim_TE_label_weighted_sum': claim_TE_label_weighted_sum,
                'claim_TE_label_malon': claim_TE_label_malon
            }
        return results

    for i, row in tqdm(textual_entailment_df.iterrows(), total=textual_entailment_df.shape[0]):
        try:
            result_sets = process_row(row)
            for key in keys:
                for k, v in result_sets[key].items():
                    te_columns[f'{k}_{key}'].append(v)
        except Exception as e:
            print(f"Error processing row {i}: {e}")
            print(row)
            raise

    for key in keys:
        for col in ['evidence_TE_prob', 'evidence_TE_prob_weighted', 'evidence_TE_labels',
                    'claim_TE_prob_weighted_sum', 'claim_TE_label_weighted_sum', 'claim_TE_label_malon']:
            textual_entailment_df[f'{col}_{key}'] = pd.Series(te_columns[f'{col}_{key}'])

    return textual_entailment_df

def TableMaking(verbalised_claims_df_final, result):
    verbalised_claims_df_final.set_index('reference_id', inplace=True)
    result.set_index('reference_id', inplace=True)
    results = pd.concat([verbalised_claims_df_final, result], axis=1)
    results['triple'] = results[['entity_label', 'property_label', 'object_label']].apply(lambda x: ', '.join(x), axis=1)
    all_result = pd.DataFrame()
    for idx, row in results.iterrows():
        aResult = pd.DataFrame(row["nlp_sentences_TOP_N"])[['sentence', 'score']]
        aResult.rename(columns={'score': 'Relevance_score'}, inplace=True)
        aResult = pd.concat([aResult, pd.DataFrame(row["evidence_TE_labels_all_TOP_N"], columns=['TextEntailment'])], axis=1)
        aResult = pd.concat([aResult, pd.DataFrame(np.max(row["evidence_TE_prob_all_TOP_N"], axis=1), columns=['Entailment_score'])], axis=1)
        aResult = aResult.reindex(columns=['sentence', 'TextEntailment', 'Entailment_score', 'Relevance_score'])
        aBox = pd.DataFrame({'triple': [row["triple"]], 'url': row['url'], 'Results': [aResult]})
        all_result = pd.concat([all_result, aBox], axis=0)

    def dataframe_to_html(all_result):
        html = '<html><head><style>table {border-collapse: collapse; width: 100%;} th, td {border: 1px solid black; padding: 8px; text-align: left;} th {background-color: #f2f2f2;}</style></head><body>'
        for triple in all_result['triple'].unique():
            html += f'<h3>Triple: {triple}</h3>'
            df = all_result[all_result['triple'] == triple].copy()
            for idx, row in df.iterrows():
                url = row['url']
                results = row['Results']
                html += f'<h3>Reference: {url}</h3>'
                html += results.to_html(index=False)
        html += '</body></html>'
        return html

    html_result = dataframe_to_html(all_result)
    return html_result

if __name__ == '__main__':
    target_QID = 'Q245247'
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    query = f"SELECT * FROM claim_text WHERE entity_id = '{target_QID}'"
    claim_df = pd.read_sql_query(query, conn)
    query = f"SELECT * FROM html_text WHERE entity_id = '{target_QID}'"
    reference_text_df = pd.read_sql_query(query, conn)
    verbalised_claims_df_final = verbalisation(claim_df)
    progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar for Gradio

    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    splited_sentences_from_html = setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)

    BATCH_SIZE = 512
    N_TOP_SENTENCES = 5
    SCORE_THRESHOLD = 0.6
    evidence_df = evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
    result = textEntailment(evidence_df, SCORE_THRESHOLD)
    conn.commit()
    conn.close()
    display_df = TableMaking(verbalised_claims_df_final, result)
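Note on the two splitting strategies above: slide_sentences pairs each sentence with its successor so that evidence spanning two sentences is not lost, and filter_overlaps keeps only the highest-scoring candidate among overlapping spans. A minimal standalone sketch of that logic, with made-up scores and sentence IDs standing in for the SentenceRetrievalModule output:

def slide_sentences(sentences, window_size=2):
    # same helper as in setencesSpliter above
    if len(sentences) < window_size:
        return [" ".join(sentences)]
    return [" ".join(sentences[i:i + window_size]) for i in range(len(sentences) - window_size + 1)]

def filter_overlaps(candidates):
    # simplified equivalent of filter_overlaps above: drop any candidate that
    # shares a sentence ID with an already-kept, higher-scoring candidate
    kept = []
    for cand in candidates:
        ids = cand['sentence_id'].split(';')
        if not any(i in k['sentence_id'].split(';') for k in kept for i in ids):
            kept.append(cand)
    return kept

sentences = ["Douglas Adams was born in Cambridge.", "He wrote science fiction.", "He died in 2001."]
windows = slide_sentences(sentences)
candidates = [
    {'sentence': sentences[0], 'score': 0.9, 'sentence_id': '0_0'},
    {'sentence': windows[0], 'score': 0.8, 'sentence_id': '0_0;0_1'},
    {'sentence': sentences[2], 'score': 0.4, 'sentence_id': '0_2'},
]
print(filter_overlaps(sorted(candidates, key=lambda x: x['score'], reverse=True)))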
Prove_llm.py
ADDED
@@ -0,0 +1,84 @@
import pandas as pd
import numpy as np
import sqlite3, torch, json, re, os, itertools, html2text
from ast import literal_eval as leval
from tqdm.auto import tqdm
from utils.verbalisation_module import VerbModule
from utils.sentence_retrieval_module import SentenceRetrievalModule
from utils.textual_entailment_module import TextualEntailmentModule
from importlib import reload
import llm_load
from html.parser import HTMLParser
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gradio as gr


def verbalisation(claim_df):
    verb_module = VerbModule()
    triples = []
    for _, row in claim_df.iterrows():
        triple = {
            'subject': row['entity_label'],
            'predicate': row['property_label'],
            'object': row['object_label']
        }
        triples.append(triple)

    claim_df['verbalisation'] = verb_module.verbalise_triples(triples)
    claim_df['verbalisation_unks_replaced'] = claim_df['verbalisation'].apply(verb_module.replace_unks_on_sentence)
    claim_df['verbalisation_unks_replaced_then_dropped'] = claim_df['verbalisation'].apply(lambda x: verb_module.replace_unks_on_sentence(x, empty_after=True))
    return claim_df

def RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress):
    join_df = pd.merge(verbalised_claims_df_final, reference_text_df[['reference_id', 'url', 'html']], on='reference_id', how='left')
    tokenizer, model = llm_load.llmLoad(4096)
    h = html2text.HTML2Text()
    h.ignore_links = True

    filtered_htmls = []
    answers = []
    verifications = []
    for idx, (html, verb) in enumerate(zip(join_df['html'], join_df['verbalisation'])):
        try:
            filtered_html = h.handle(html)
            filtered_htmls.append(filtered_html)
            instruct = "Find the most relevant sentences from the filtered HTML document based on the given target sentence. If there are no directly related sentences, try to find sentences that provide context or background information related to the target sentence. Only answer 'nothing' if there is absolutely no relevant information in the document. Do not include any HTML tags or markup in your answer."
            question = f"target sentence:'{verb}', filtered HTML document:{filtered_html}"
            answer = llm_load.llmQuestion(tokenizer, model, instruct, question, output_size=128)
            answers.append(answer)
        except Exception:
            filtered_htmls.append('Malformed html')  # keep the lists the same length for the DataFrame below
            answers.append('Malformed html')
        instruct = "Determine whether the target sentence is supported by the given evidence or not. If so, answer 'supportive'. If not, answer 'No supports'. If you cannot determine this from the given evidence, answer 'Not enough information'."
        question = f"target sentence:'{verb}', evidence:{answers[-1]}"
        verification = llm_load.llmQuestion(tokenizer, model, instruct, question, output_size=64)
        verifications.append(verification)

        update_progress(idx, len(join_df))  # Update progress

    return pd.DataFrame({'verbalisation': join_df['verbalisation'], 'verification': verifications, 'evidence_set': answers, 'url': join_df['url'], 'filtered_html': filtered_htmls})


if __name__ == '__main__':
    target_QID = 'Q42'
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    query = f"SELECT * FROM claim_text WHERE entity_id = '{target_QID}'"
    claim_df = pd.read_sql_query(query, conn)

    query = f"SELECT * FROM html_text WHERE entity_id = '{target_QID}'"
    reference_text_df = pd.read_sql_query(query, conn)

    verbalised_claims_df_final = verbalisation(claim_df)

    progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar

    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    result = RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)

    conn.commit()
    conn.close()
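Prove_llm.py relies on the local llm_load module (uploaded in this commit) for llmLoad and llmQuestion. For a dry run of RelevantSentenceSelection without downloading a model, a stub with the same call shape can be swapped in; the sketch below is purely illustrative and reflects only the two signatures used above, not the real implementation in llm_load.py.

# stub_llm_load.py -- hypothetical stand-in for llm_load, useful only for
# exercising the HTML-to-text and dataframe plumbing in Prove_llm.py without a GPU.
def llmLoad(max_len):
    # the real function returns a tokenizer and a model; placeholders suffice here
    return None, None

def llmQuestion(tokenizer, model, instruct, question, output_size=128):
    # echo a truncated view of the prompt instead of generating text
    return f"[stub] {instruct[:40]}... / {question[:60]}..."

Importing it as `import stub_llm_load as llm_load` (or monkey-patching the module attribute) lets the rest of the script run end to end in seconds.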
SimpleUI_lite.py
ADDED
@@ -0,0 +1,122 @@
import gradio as gr
import Wikidata_Text_Parser as wtr
import sqlite3
import Prove_lite as prv
import pandas as pd
import numpy as np
import os

def wtr_process(qid):
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = qid
        query = "SELECT * FROM claim_text"
        df = pd.read_sql_query(query, conn)
        if target_QID in df['entity_id'].unique():
            pass
        else:
            wtr.claimParser(target_QID)  # save results in .db
            filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
            url_set = wtr.urlParser(target_QID)  # from ref table in .db
            html_set = wtr.htmlParser(url_set, target_QID)  # original html docs collection
            claim_text = wtr.claim2text(html_set)  # claims generation
            html_text = wtr.html2text(html_set)
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
            conn.commit()
        query = f"""
            SELECT
                claim_text.entity_label,
                claim_text.property_label,
                claim_text.object_label,
                html_text.url
            FROM claim_text
            INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
            WHERE claim_text.entity_id = '{target_QID}'
        """

        result_df = pd.read_sql_query(query, conn)

        conn.commit()
        conn.close()

        return result_df

    except Exception as e:
        error_df = pd.DataFrame({'Error': [str(e)]})
        return error_df


def prv_process(qid):
    target_QID = qid
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    query = f"SELECT * FROM claim_text WHERE entity_id = '{target_QID}'"
    claim_df = pd.read_sql_query(query, conn)

    query = f"SELECT * FROM html_text WHERE entity_id = '{target_QID}'"
    reference_text_df = pd.read_sql_query(query, conn)

    verbalised_claims_df_final = prv.verbalisation(claim_df)

    progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar for Gradio

    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    splited_sentences_from_html = prv.setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)

    BATCH_SIZE = 512
    N_TOP_SENTENCES = 5
    SCORE_THRESHOLD = 0
    evidence_df = prv.evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
    result = prv.textEntailment(evidence_df, SCORE_THRESHOLD)
    display_df = prv.TableMaking(verbalised_claims_df_final, result)
    conn.commit()
    conn.close()
    return display_df


with gr.Blocks() as demo:
    print("gradio started!")
    gr.Markdown(
        """
        # Prove
        This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
        """
    )
    inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q245247)")
    out = gr.Dataframe(label="Parsing result (not presenting parsed HTMLs)", headers=["entity_label", "property_label", "object_label", "url"])
    run_button_1 = gr.Button("Start parsing")
    run_button_1.click(wtr_process, inp, out)

    gr.Markdown(
        """
        Pre-trained language model-based text entailment.
        """
    )
    out_2 = gr.HTML(label="Results")
    run_button_2 = gr.Button("Start processing")
    run_button_2.click(prv_process, inp, out_2)


if __name__ == "__main__":
    # DB initialising
    if not os.path.isfile('wikidata_claims_refs_parsed.db'):
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = 'Q115305900'
        wtr.claimParser(target_QID)  # save results in .db
        filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
        url_set = wtr.urlParser(target_QID)  # from ref table in .db
        html_set = wtr.htmlParser(url_set, target_QID)  # original html docs collection
        claim_text = wtr.claim2text(html_set)  # claims generation
        html_text = wtr.html2text(html_set)
        claim_text = claim_text.astype(str)
        html_text = html_text.astype(str)
        claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
        html_text.to_sql('html_text', conn, if_exists='replace', index=False)
        conn.commit()
        conn.close()
    demo.launch(share=True)
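Both buttons above read and write the same SQLite file. Assuming a "Start parsing" run has already populated it, the claim/reference join used in wtr_process can be checked directly from a Python shell (the QID below is just an example value):

import sqlite3
import pandas as pd

conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
df = pd.read_sql_query(
    """
    SELECT claim_text.entity_label, claim_text.property_label,
           claim_text.object_label, html_text.url
    FROM claim_text
    INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
    WHERE claim_text.entity_id = ?
    """,
    conn, params=('Q245247',),
)
print(df.head())
conn.close()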
SimpleUI_llm.py
ADDED
@@ -0,0 +1,136 @@
import gradio as gr
import Wikidata_Text_Parser as wtr
import sqlite3
import CodeArchive.Prove_llm as prv
import pandas as pd

def wtr_process(qid):
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = qid

        cursor = conn.cursor()

        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='claims'")
        table_exists = cursor.fetchone()

        if table_exists:
            cursor.execute("SELECT entity_id FROM claims WHERE entity_id=?", (target_QID,))
            result = cursor.fetchone()

            if result is not None and result[0] == target_QID:
                print(result)
                print(f"{target_QID} already exists in the 'claims' table. Skipping execution.")
            else:
                progress = gr.Progress(0)
                progress(0.00, desc="Wikidata claims parsing...")
                wtr.claimParser(target_QID)  # save results in .db
                filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
                progress(0.25, desc="URL and HTML parsing...")
                url_set = wtr.urlParser()  # from ref table in .db
                html_set = wtr.htmlParser(url_set, qid)  # original html docs collection
                progress(0.50, desc="claim2Text...")
                claim_text = wtr.claim2text(html_set)  # claims generation
                progress(0.74, desc="html2Text...")
                html_text = wtr.html2text(html_set)
                claim_text = claim_text.astype(str)
                html_text = html_text.astype(str)
                claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
                html_text.to_sql('html_text', conn, if_exists='replace', index=False)
                progress(1, desc="completed...")
        else:
            progress = gr.Progress(0)
            progress(0.00, desc="Wikidata claims parsing...")
            wtr.claimParser(target_QID)  # save results in .db
            filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
            progress(0.25, desc="URL and HTML parsing...")
            url_set = wtr.urlParser()  # from ref table in .db
            html_set = wtr.htmlParser(url_set)  # original html docs collection
            progress(0.50, desc="claim2Text...")
            claim_text = wtr.claim2text(html_set)  # claims generation
            progress(0.74, desc="html2Text...")
            html_text = wtr.html2text(html_set)
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
            progress(1, desc="completed...")

        query = f"""
            SELECT
                claim_text.entity_label,
                claim_text.property_label,
                claim_text.object_label,
                html_text.url
            FROM claim_text
            INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
            WHERE claim_text.entity_id = '{target_QID}'
        """

        result_df = pd.read_sql_query(query, conn)

        conn.commit()
        conn.close()

        return result_df

    except Exception as e:
        error_df = pd.DataFrame({'Error': [str(e)]})
        return error_df


def prv_process(qid):
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')

    query = f"""
        SELECT html_text.*
        FROM html_text
        INNER JOIN claim_text ON html_text.reference_id = claim_text.reference_id
        WHERE claim_text.entity_id = '{qid}'
    """
    reference_text_df = pd.read_sql_query(query, conn)
    query = f"SELECT * FROM claim_text WHERE entity_id = '{qid}'"
    claim_df = pd.read_sql_query(query, conn)

    verbalised_claims_df_final = prv.verbalisation(claim_df)
    progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar

    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    result = prv.RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)

    conn.close()
    return result


with gr.Blocks() as demo:
    print("gradio started!")
    gr.Markdown(
        """
        # Reference Quality Verification Tool
        This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
        Parsing could take 3~5 mins depending on the number of references.
        """
    )
    inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q42)")
    out = gr.Dataframe(label="Parsing result (not presenting parsed HTMLs)", headers=["entity_label", "property_label", "object_label", "url"])
    run_button_1 = gr.Button("Start parsing")
    run_button_1.click(wtr_process, inp, out)

    gr.Markdown(
        """
        LLM-based HTML parsing and verification!
        """
    )
    out_2 = gr.DataFrame(label="LLM-based verification result")

    run_button_2 = gr.Button("Start processing")
    run_button_2.click(prv_process, inp, out_2)


if __name__ == "__main__":
    demo.launch(share=True)
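The sqlite_master lookup in wtr_process is the standard way to test for a table before querying it; the same check can be factored into a small helper (a sketch with an illustrative helper name, not part of the uploaded code):

import sqlite3

def table_exists(conn, table_name):
    # parameterised check against sqlite_master, as in wtr_process above
    cur = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,))
    return cur.fetchone() is not None

conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
print(table_exists(conn, 'claims'), table_exists(conn, 'claim_text'))
conn.close()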
UI_tester.py
ADDED
@@ -0,0 +1,52 @@
import gradio as gr
import Wikidata_Text_Parser as wtr
import sqlite3

def process_input(qid):
    progress = gr.Progress(0)

    wtr.claimParser(qid)

    progress(0.20, desc="Filtering properties...")
    filtered_df = wtr.propertyFiltering(qid)

    progress(0.40, desc="Parsing URLs...")
    url_set = wtr.urlParser()

    progress(0.60, desc="Parsing HTML...")
    html_set = wtr.htmlParser(url_set)

    progress(0.80, desc="Generating claim text...")
    claim_text = wtr.claim2text(html_set)  # claims generation

    progress(1, desc="Converting HTML to text...")
    html_text = wtr.html2text(html_set)

    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    claim_text = claim_text.astype(str)
    html_text = html_text.astype(str)
    claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
    html_text.to_sql('html_text', conn, if_exists='replace', index=False)
    conn.commit()
    conn.close()
    return f"{html_text.shape[0]} HTML documents collected via references of {qid}"

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Reference Quality Verification Tool
        This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.

        Parsing could take 3~5 mins depending on the number of references.
        """
    )

    inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q42)")
    out = gr.Textbox(label="Parsing result")
    run_button = gr.Button("Start parsing")
    run_button.click(process_input, inp, out)


if __name__ == "__main__":
    demo.launch()
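UI_tester.py (like the other UIs) instantiates gr.Progress(0) inside the handler. Gradio's documented pattern is instead to take the tracker as a default argument so the framework can inject and render it; if the inline instantiation above does not show a progress bar on newer Gradio versions, the following sketch (with toy steps only) illustrates that pattern:

import time
import gradio as gr

def process_input(qid, progress=gr.Progress()):
    # Gradio injects and renders this tracker when the handler runs from the UI
    steps = ["Parsing claims...", "Parsing URLs...", "Parsing HTML..."]
    for i, desc in enumerate(steps):
        progress((i + 1) / len(steps), desc=desc)
        time.sleep(0.2)
    return f"done: {qid}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input QID")
    out = gr.Textbox(label="Result")
    gr.Button("Run").click(process_input, inp, out)

if __name__ == "__main__":
    demo.launch()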
Wikidata_Text_Parser.py
ADDED
@@ -0,0 +1,929 @@
1 |
+
import numpy as np
|
2 |
+
from tqdm import tqdm
|
3 |
+
import pandas as pd
|
4 |
+
import os, sqlite3, traceback, ast, requests, fasttext, re, time, string, spacy, pysbd
|
5 |
+
from requests.exceptions import ReadTimeout, TooManyRedirects, ConnectionError, ConnectTimeout, InvalidSchema, InvalidURL
|
6 |
+
from qwikidata.linked_data_interface import get_entity_dict_from_api
|
7 |
+
from datetime import datetime
|
8 |
+
import utils.wikidata_utils as wdutils
|
9 |
+
from importlib import reload
|
10 |
+
from urllib.parse import urlparse, unquote
|
11 |
+
from urllib import parse
|
12 |
+
from bs4 import BeautifulSoup
|
13 |
+
from IPython.display import clear_output
|
14 |
+
from os.path import exists
|
15 |
+
from pathlib import Path
|
16 |
+
from nltk.tokenize import sent_tokenize
|
17 |
+
from sentence_splitter import SentenceSplitter, split_text_into_sentences
|
18 |
+
|
19 |
+
|
20 |
+
class DatabaseExtractor():
|
21 |
+
def __init__(self, dbname='wikidata_claims_refs_parsed.db'):
|
22 |
+
self.dbname = dbname
|
23 |
+
self.prepare_extraction()
|
24 |
+
|
25 |
+
def finish_extraction(self):
|
26 |
+
self.db.commit()
|
27 |
+
|
28 |
+
def prepare_extraction(self):
|
29 |
+
self.db = sqlite3.connect(self.dbname)
|
30 |
+
self.cursor = self.db.cursor()
|
31 |
+
|
32 |
+
self.cursor.execute('''
|
33 |
+
CREATE TABLE IF NOT EXISTS claims(
|
34 |
+
entity_id TEXT,
|
35 |
+
claim_id TEXT,
|
36 |
+
rank TEXT,
|
37 |
+
property_id TEXT,
|
38 |
+
datatype TEXT,
|
39 |
+
datavalue TEXT,
|
40 |
+
PRIMARY KEY (
|
41 |
+
claim_id
|
42 |
+
)
|
43 |
+
)''')
|
44 |
+
|
45 |
+
self.cursor.execute('''
|
46 |
+
CREATE TABLE IF NOT EXISTS claims_refs(
|
47 |
+
claim_id TEXT,
|
48 |
+
reference_id TEXT,
|
49 |
+
PRIMARY KEY (
|
50 |
+
claim_id,
|
51 |
+
reference_id
|
52 |
+
)
|
53 |
+
)''')
|
54 |
+
|
55 |
+
self.cursor.execute('''
|
56 |
+
CREATE TABLE IF NOT EXISTS refs(
|
57 |
+
reference_id TEXT,
|
58 |
+
reference_property_id TEXT,
|
59 |
+
reference_index TEXT,
|
60 |
+
reference_datatype TEXT,
|
61 |
+
reference_value TEXT,
|
62 |
+
PRIMARY KEY (
|
63 |
+
reference_id,
|
64 |
+
reference_property_id,
|
65 |
+
reference_index
|
66 |
+
)
|
67 |
+
)''')
|
68 |
+
self.db.commit()
|
69 |
+
|
70 |
+
def extract_claim(self, entity_id, claim):
|
71 |
+
if claim['mainsnak']['snaktype'] == 'value':
|
72 |
+
value = str(claim['mainsnak']['datavalue'])
|
73 |
+
else:
|
74 |
+
value = claim['mainsnak']['snaktype']
|
75 |
+
try:
|
76 |
+
self.cursor.execute('''
|
77 |
+
INSERT INTO claims(entity_id, claim_id, rank, property_id, datatype, datavalue)
|
78 |
+
VALUES($var,$var,$var,$var,$var,$var)'''.replace('$var','?'), (
|
79 |
+
entity_id,claim['id'],claim['rank'],
|
80 |
+
claim['mainsnak']['property'],claim['mainsnak']['datatype'],value
|
81 |
+
))
|
82 |
+
except UnicodeEncodeError:
|
83 |
+
print(entity_id,claim['id'],claim['rank'],
|
84 |
+
claim['mainsnak']['property'],claim['mainsnak']['datatype'],value)
|
85 |
+
raise
|
86 |
+
except sqlite3.IntegrityError as err:
|
87 |
+
#self.db.rollback()
|
88 |
+
self.cursor.execute(
|
89 |
+
'''SELECT *
|
90 |
+
FROM claims
|
91 |
+
WHERE claim_id=$var
|
92 |
+
'''.replace('$var','?'), (claim['id'],)
|
93 |
+
)
|
94 |
+
conflicted_value = self.cursor.fetchone()
|
95 |
+
if conflicted_value == (entity_id,claim['id'],claim['rank'],
|
96 |
+
claim['mainsnak']['property'],claim['mainsnak']['datatype'],value):
|
97 |
+
pass
|
98 |
+
else:
|
99 |
+
print(err, claim['id'])
|
100 |
+
traceback.print_exc()
|
101 |
+
raise err
|
102 |
+
finally:
|
103 |
+
#self.db.commit()
|
104 |
+
pass
|
105 |
+
|
106 |
+
def extract_reference(self, ref):
|
107 |
+
for snaks in ref['snaks'].values():
|
108 |
+
for i, snak in enumerate(snaks):
|
109 |
+
if snak['snaktype'] == 'value':
|
110 |
+
value = str(snak['datavalue'])
|
111 |
+
else:
|
112 |
+
value = snak['snaktype']
|
113 |
+
try:
|
114 |
+
self.cursor.execute('''
|
115 |
+
INSERT INTO refs(reference_id, reference_property_id, reference_index,
|
116 |
+
reference_datatype, reference_value)
|
117 |
+
VALUES($var,$var,$var,$var,$var)'''.replace('$var','?'), (
|
118 |
+
ref['hash'],snak['property'],str(i),snak['datatype'],value
|
119 |
+
))
|
120 |
+
except sqlite3.IntegrityError as err:
|
121 |
+
#self.db.rollback()
|
122 |
+
self.cursor.execute(# WE DONT USE THE INDEX HERE, THEY TEND TO COME SHUFFLED FROM API AND SORTING TAKES TOO LONG
|
123 |
+
'''SELECT reference_id, reference_property_id, reference_datatype, reference_value
|
124 |
+
FROM refs
|
125 |
+
WHERE reference_id = $var
|
126 |
+
AND reference_property_id = $var
|
127 |
+
'''.replace('$var','?'), (ref['hash'],snak['property'])
|
128 |
+
)
|
129 |
+
conflicted_values = self.cursor.fetchall()
|
130 |
+
if (ref['hash'],snak['property'],snak['datatype'],value) in conflicted_values:
|
131 |
+
pass
|
132 |
+
else:
|
133 |
+
print(err, ref['hash'],snak['property'],i)
|
134 |
+
print('trying to insert:',(ref['hash'],snak['property'],str(i),snak['datatype'],value))
|
135 |
+
traceback.print_exc()
|
136 |
+
raise err
|
137 |
+
finally:
|
138 |
+
#self.db.commit()
|
139 |
+
pass
|
140 |
+
|
141 |
+
def extract_claim_reference(self, claim, ref):
|
142 |
+
claim['id'],ref['hash']
|
143 |
+
try:
|
144 |
+
self.cursor.execute('''
|
145 |
+
INSERT INTO claims_refs(claim_id, reference_id)
|
146 |
+
VALUES($var,$var)'''.replace('$var','?'), (
|
147 |
+
claim['id'],ref['hash']
|
148 |
+
))
|
149 |
+
except sqlite3.IntegrityError as err:
|
150 |
+
#db.rollback()
|
151 |
+
pass
|
152 |
+
finally:
|
153 |
+
#self.db.commit()
|
154 |
+
pass
|
155 |
+
|
156 |
+
def extract_entity(self, e):
|
157 |
+
for outgoing_property_id in e['claims'].values():
|
158 |
+
for claim in outgoing_property_id:
|
159 |
+
self.extract_claim(e['id'],claim)
|
160 |
+
if 'references' in claim:
|
161 |
+
for ref in claim['references']:
|
162 |
+
self.extract_claim_reference(claim, ref)
|
163 |
+
self.extract_reference(ref)
|
164 |
+
|
165 |
+
def claimParser(QID):
|
166 |
+
entity_id = QID
|
167 |
+
print('Setting up database ...')
|
168 |
+
extractor = DatabaseExtractor()
|
169 |
+
|
170 |
+
print('Fetching entity from API ...')
|
171 |
+
entity = get_entity_dict_from_api(entity_id)
|
172 |
+
|
173 |
+
if entity:
|
174 |
+
print(f'Parsing entity: {entity_id}')
|
175 |
+
extractor.extract_entity(entity)
|
176 |
+
else:
|
177 |
+
print(f'Failed to fetch entity: {entity_id}')
|
178 |
+
|
179 |
+
extractor.finish_extraction()
|
180 |
+
|
181 |
+
def propertyFiltering(QID):
|
182 |
+
reload(wdutils)
|
183 |
+
DB_PATH = 'wikidata_claims_refs_parsed.db'
|
184 |
+
claims_columns = ['entity_id','claim_id','rank','property_id','datatype','datavalue']
|
185 |
+
|
186 |
+
properties_to_remove = {
|
187 |
+
'general':[
|
188 |
+
'P31', # - instance of
|
189 |
+
'P279',# - subclass of
|
190 |
+
'P373',# - commons category
|
191 |
+
'P910',# - Topic's main category
|
192 |
+
'P7561',# - category for the interior of the item
|
193 |
+
'P5008',# - on focus list of Wikimedia project
|
194 |
+
'P2670',# - has parts of the class
|
195 |
+
'P1740',# - category for films shot at this location
|
196 |
+
'P1612',# - Commons Institution page
|
197 |
+
'P8989',# - category for the view of the item
|
198 |
+
'P2959',# - permanent duplicated item
|
199 |
+
'P7867',# - category for maps
|
200 |
+
'P935' ,# - Commons gallery
|
201 |
+
'P1472',# - Commons Creator page
|
202 |
+
'P8596',# category for the exterior of the item
|
203 |
+
'P5105',# Deutsche Bahn station category
|
204 |
+
'P8933',# category for the view from the item
|
205 |
+
'P642',# of
|
206 |
+
'P3876',# category for alumni of educational institution
|
207 |
+
'P1791',# category of people buried here
|
208 |
+
'P7084',# related category
|
209 |
+
'P1465',# category for people who died here
|
210 |
+
'P1687',# Wikidata property
|
211 |
+
'P6104',# maintained by WikiProject
|
212 |
+
'P4195',# category for employees of the organization
|
213 |
+
'P1792',# category of associated people
|
214 |
+
'P5869',# model item
|
215 |
+
'P1659',# see also
|
216 |
+
'P1464',# category for people born here
|
217 |
+
'P2354',# has list
|
218 |
+
'P1424',# topic's main template
|
219 |
+
'P7782',# category for ship name
|
220 |
+
'P179',# part of the series
|
221 |
+
'P7888',# merged into
|
222 |
+
'P6365',# member category
|
223 |
+
'P8464',# content partnership category
|
224 |
+
'P360',# is a list of
|
225 |
+
'P805',# statement is subject of
|
226 |
+
'P8703',# entry in abbreviations table
|
227 |
+
'P1456',# list of monuments
|
228 |
+
'P1012',# including
|
229 |
+
'P1151',# topic's main Wikimedia portal
|
230 |
+
'P2490',# page at OSTIS Belarus Wiki
|
231 |
+
'P593',# HomoloGene ID
|
232 |
+
'P8744',# economy of topic
|
233 |
+
'P2614',# World Heritage criteria
|
234 |
+
'P2184',# history of topic
|
235 |
+
'P9241',# demographics of topic
|
236 |
+
'P487',#Unicode character
|
237 |
+
'P1754',#category related to list
|
238 |
+
'P2559',#Wikidata usage instructions
|
239 |
+
'P2517',#category for recipients of this award
|
240 |
+
'P971',#category combines topics
|
241 |
+
'P6112',# category for members of a team
|
242 |
+
'P4224',#category contains
|
243 |
+
'P301',#category's main topic
|
244 |
+
'P1753',#list related to category
|
245 |
+
'P1423',#template has topic
|
246 |
+
'P1204',#Wikimedia portal's main topic
|
247 |
+
'P3921',#Wikidata SPARQL query equivalent
|
248 |
+
'P1963',#properties for this type
|
249 |
+
'P5125',#Wikimedia outline
|
250 |
+
'P3176',#uses property
|
251 |
+
'P8952',#inappropriate property for this type
|
252 |
+
'P2306',#property
|
253 |
+
'P5193',#Wikidata property example for forms
|
254 |
+
'P5977',#Wikidata property example for senses
|
255 |
+
],
|
256 |
+
'specific': {}
|
257 |
+
}
|
258 |
+
|
259 |
+
db = sqlite3.connect(DB_PATH)
|
260 |
+
cursor = db.cursor()
|
261 |
+
# To see how many out of the total number of stored claims we are excluding by removing the general properties
|
262 |
+
sql_query = "select count(*) from claims where property_id in $1;"
|
263 |
+
sql_query = sql_query.replace('$1', '(' + ','.join([('"' + e + '"') for e in properties_to_remove['general']]) + ')')
|
264 |
+
cursor.execute(sql_query)
|
265 |
+
print('Removing the',len(properties_to_remove['general']),'properties deemed as ontological or unverbalisable')
|
266 |
+
cursor = db.cursor()
|
267 |
+
|
268 |
+
sql_query = "select * from claims where entity_id in $1;"
|
269 |
+
sql_query = sql_query.replace('$1', '(' + ','.join([('"' + e + '"') for e in [QID]]) + ')')
|
270 |
+
|
271 |
+
cursor.execute(sql_query)
|
272 |
+
theme_df = pd.DataFrame(cursor.fetchall())
|
273 |
+
theme_df.columns = claims_columns
|
274 |
+
|
275 |
+
original_theme_df_size = theme_df.shape[0]
|
276 |
+
last_stage_theme_df_size = original_theme_df_size
|
277 |
+
|
278 |
+
print('- Removing deprecated')
|
279 |
+
|
280 |
+
# Remove deprecated
|
281 |
+
theme_df = theme_df[theme_df['rank'] != 'deprecated'].reset_index(drop=True)
|
282 |
+
print(
|
283 |
+
' - Percentage of deprecated:',
|
284 |
+
round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%'
|
285 |
+
)
|
286 |
+
last_stage_theme_df_size = theme_df.shape[0]
|
287 |
+
|
288 |
+
print('- Removing bad datatypes')
|
289 |
+
|
290 |
+
# Remove external_ids, commonsMedia (e.g. photos), globe-coordinates, urls
|
291 |
+
bad_datatypes = ['commonsMedia','external-id','globe-coordinate','url', 'wikibase-form',
|
292 |
+
'geo-shape', 'math', 'musical-notation', 'tabular-data', 'wikibase-sense']
|
293 |
+
theme_df = theme_df[
|
294 |
+
theme_df['datatype'].apply(
|
295 |
+
lambda x : x not in bad_datatypes
|
296 |
+
)
|
297 |
+
].reset_index(drop=True)
|
298 |
+
print(
|
299 |
+
' - Percentage of bad datatypes:',
|
300 |
+
round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%'
|
301 |
+
)
|
302 |
+
last_stage_theme_df_size = theme_df.shape[0]
|
303 |
+
|
304 |
+
print('- Removing bad properties')
|
305 |
+
|
306 |
+
# Remove specific properties such as P31 and P279
|
307 |
+
theme_df = theme_df[
|
308 |
+
theme_df['property_id'].apply(
|
309 |
+
lambda x : (x not in properties_to_remove['general']))
|
310 |
+
|
311 |
+
].reset_index(drop=True)
|
312 |
+
print(
|
313 |
+
' - Percentage of ontology (non-domain) properties:',
|
314 |
+
round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%'
|
315 |
+
)
|
316 |
+
last_stage_theme_df_size = theme_df.shape[0]
|
317 |
+
|
318 |
+
print('- Removing somevalue/novalue')
|
319 |
+
|
320 |
+
# Remove novalue and somevalue
|
321 |
+
theme_df = theme_df[
|
322 |
+
theme_df['datavalue'].apply(
|
323 |
+
lambda x : x not in ['somevalue', 'novalue']
|
324 |
+
)
|
325 |
+
].reset_index(drop=True)
|
326 |
+
print(
|
327 |
+
' - Percentage of somevalue/novalue:',
|
328 |
+
round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%'
|
329 |
+
)
|
330 |
+
last_stage_theme_df_size = theme_df.shape[0]
|
331 |
+
|
332 |
+
print(
|
333 |
+
'After all removals, we keep',
|
334 |
+
round(last_stage_theme_df_size/original_theme_df_size*100, 2),
|
335 |
+
)
|
336 |
+
theme_df.to_sql('claims', db, if_exists='replace', index=False)
|
337 |
+
|
338 |
+
return theme_df
|
339 |
+
|
340 |
+
def get_object_label_given_datatype(row):
|
341 |
+
Wd_API = wdutils.CachedWikidataAPI()
|
342 |
+
Wd_API.languages = ['en']
|
343 |
+
def turn_to_century_or_millennium(y, mode):
|
344 |
+
y = str(y)
|
345 |
+
if mode == 'C':
|
346 |
+
div = 100
|
347 |
+
group = int(y.rjust(3, '0')[:-2])
|
348 |
+
mode_name = 'century'
|
349 |
+
elif mode == 'M':
|
350 |
+
div = 1000
|
351 |
+
group = int(y.rjust(4, '0')[:-3])
|
352 |
+
mode_name = 'millenium'
|
353 |
+
else:
|
354 |
+
raise ValueError('Use mode = C for century and M for millennium')
|
355 |
+
|
356 |
+
if int(y)%div != 0:
|
357 |
+
group += 1
|
358 |
+
group = str(group)
|
359 |
+
|
360 |
+
group_suffix = (
|
361 |
+
'st' if group[-1] == '1' else (
|
362 |
+
'nd' if group[-1] == '2' else (
|
363 |
+
'rd' if group[-1] == '3' else 'th'
|
364 |
+
)
|
365 |
+
)
|
366 |
+
)
|
367 |
+
|
368 |
+
return ' '.join([group+group_suffix, mode_name])
|
369 |
+
|
370 |
+
dt = row['datatype']
|
371 |
+
dv = row['datavalue']
|
372 |
+
|
373 |
+
dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string']
|
374 |
+
if dt not in dt_types:
|
375 |
+
print(dt)
|
376 |
+
raise ValueError
|
377 |
+
else:
|
378 |
+
try:
|
379 |
+
if dt == dt_types[0]:
|
380 |
+
return Wd_API.get_label(ast.literal_eval(dv)['value']['id'], True) #get label here
|
381 |
+
elif dt == dt_types[1]:
|
382 |
+
dv = ast.literal_eval(dv)
|
383 |
+
return (dv['value']['text'], dv['value']['language'])
|
384 |
+
elif dt == dt_types[2]:
|
385 |
+
dv = ast.literal_eval(dv)
|
386 |
+
amount, unit = dv['value']['amount'], dv['value']['unit']
|
387 |
+
if amount[0] == '+':
|
388 |
+
amount = amount[1:]
|
389 |
+
if str(unit) == '1':
|
390 |
+
return (str(amount), 'en')
|
391 |
+
else:
|
392 |
+
unit_entity_id = unit.split('/')[-1]
|
393 |
+
unit = Wd_API.get_label(unit_entity_id, True)#get label here
|
394 |
+
return (' '.join([amount, unit[0]]), unit[1])
|
395 |
+
elif dt == dt_types[3]:
|
396 |
+
dv = ast.literal_eval(dv)
|
397 |
+
time = dv['value']['time']
|
398 |
+
timezone = dv['value']['timezone']
|
399 |
+
precision = dv['value']['precision']
|
400 |
+
assert dv['value']['after'] == 0 and dv['value']['before'] == 0
|
401 |
+
|
402 |
+
sufix = 'BC' if time[0] == '-' else ''
|
403 |
+
time = time[1:]
|
404 |
+
|
405 |
+
if precision == 11: #date
|
406 |
+
return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%d/%m/%Y') + sufix, 'en')
|
407 |
+
elif precision == 10: #month
|
408 |
+
try:
|
409 |
+
return (datetime.strptime(time, '%Y-%m-00T00:00:%SZ').strftime("%B of %Y") + sufix, 'en')
|
410 |
+
except ValueError:
|
411 |
+
return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime("%B of %Y") + sufix, 'en')
|
412 |
+
elif precision == 9: #year
|
413 |
+
try:
|
414 |
+
return (datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y') + sufix, 'en')
|
415 |
+
except ValueError:
|
416 |
+
return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%Y') + sufix, 'en')
|
417 |
+
elif precision == 8: #decade
|
418 |
+
try:
|
419 |
+
return (datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')[:-1] +'0s' + sufix, 'en')
|
420 |
+
except ValueError:
|
421 |
+
return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%Y')[:-1] +'0s' + sufix, 'en')
|
422 |
+
elif precision == 7: #century
|
423 |
+
try:
|
424 |
+
parsed_time = datetime.strptime(time, '%Y-00-00T00:00:%SZ')
|
425 |
+
except ValueError:
|
426 |
+
parsed_time = datetime.strptime(time, '%Y-%m-%dT00:00:%SZ')
|
427 |
+
finally:
|
428 |
+
return (turn_to_century_or_millennium(
|
429 |
+
parsed_time.strftime('%Y'), mode='C'
|
430 |
+
) + sufix, 'en')
|
431 |
+
elif precision == 6: #millennium
|
432 |
+
try:
|
433 |
+
parsed_time = datetime.strptime(time, '%Y-00-00T00:00:%SZ')
|
434 |
+
except ValueError:
|
435 |
+
parsed_time = datetime.strptime(time, '%Y-%m-%dT00:00:%SZ')
|
436 |
+
finally:
|
437 |
+
return (turn_to_century_or_millennium(
|
438 |
+
parsed_time.strftime('%Y'), mode='M'
|
439 |
+
) + sufix, 'en')
|
440 |
+
elif precision == 4: #hundred thousand years
|
441 |
+
timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y'))
|
442 |
+
timeint = round(timeint/1e5,1)
|
443 |
+
return (str(timeint) + 'hundred thousand years' + sufix, 'en')
|
444 |
+
elif precision == 3: #million years
|
445 |
+
timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y'))
|
446 |
+
timeint = round(timeint/1e6,1)
|
447 |
+
return (str(timeint) + 'million years' + sufix, 'en')
|
448 |
+
elif precision == 0: #billion years
|
449 |
+
timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y'))
|
450 |
+
timeint = round(timeint/1e9,1)
|
451 |
+
return (str(timeint) + 'billion years' +sufix, 'en')
|
452 |
+
elif dt == dt_types[4]:
|
453 |
+
return (ast.literal_eval(dv)['value'], 'en')
|
454 |
+
except ValueError as e:
|
455 |
+
#pdb.set_trace()
|
456 |
+
raise e
|
457 |
+
|
458 |
+
def get_object_desc_given_datatype(row):
|
459 |
+
Wd_API = wdutils.CachedWikidataAPI()
|
460 |
+
Wd_API.languages = ['en']
|
461 |
+
dt = row['datatype']
|
462 |
+
dv = row['datavalue']
|
463 |
+
|
464 |
+
dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string']
|
465 |
+
if dt not in dt_types:
|
466 |
+
print(dt)
|
467 |
+
raise ValueError
|
468 |
+
else:
|
469 |
+
try:
|
470 |
+
if dt == dt_types[0]:
|
471 |
+
return Wd_API.get_desc(ast.literal_eval(dv)['value']['id']) #get label here
|
472 |
+
elif dt == dt_types[1]:
|
473 |
+
return ('no-desc', 'none')
|
474 |
+
elif dt == dt_types[2]:
|
475 |
+
dv = ast.literal_eval(dv)
|
476 |
+
amount, unit = dv['value']['amount'], dv['value']['unit']
|
477 |
+
if amount[0] == '+':
|
478 |
+
amount = amount[1:]
|
479 |
+
if str(unit) == '1':
|
480 |
+
return ('no-desc', 'none')
|
481 |
+
else:
|
482 |
+
unit_entity_id = unit.split('/')[-1]
|
483 |
+
return Wd_API.get_desc(unit_entity_id)
|
484 |
+
elif dt == dt_types[3]:
|
485 |
+
return ('no-desc', 'none')
|
486 |
+
elif dt == dt_types[4]:
|
487 |
+
return ('no-desc', 'none')
|
488 |
+
except ValueError as e:
|
489 |
+
#pdb.set_trace()
|
490 |
+
raise e
|
491 |
+
|
492 |
+
def get_object_alias_given_datatype(row):
|
493 |
+
Wd_API = wdutils.CachedWikidataAPI()
|
494 |
+
Wd_API.languages = ['en']
|
495 |
+
dt = row['datatype']
|
496 |
+
dv = row['datavalue']
|
497 |
+
|
498 |
+
dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string']
|
499 |
+
if dt not in dt_types:
|
500 |
+
print(dt)
|
501 |
+
raise ValueError
|
502 |
+
else:
|
503 |
+
try:
|
504 |
+
if dt == dt_types[0]:
|
505 |
+
return Wd_API.get_alias(ast.literal_eval(dv)['value']['id']) #get label here
|
506 |
+
elif dt == dt_types[1]:
|
507 |
+
return ('no-alias', 'none')
|
508 |
+
elif dt == dt_types[2]:
|
509 |
+
dv = ast.literal_eval(dv)
|
510 |
+
amount, unit = dv['value']['amount'], dv['value']['unit']
|
511 |
+
if amount[0] == '+':
|
512 |
+
amount = amount[1:]
|
513 |
+
if str(unit) == '1':
|
514 |
+
return ('no-alias', 'none')
|
515 |
+
else:
|
516 |
+
unit_entity_id = unit.split('/')[-1]
|
517 |
+
return Wd_API.get_alias(unit_entity_id)
|
518 |
+
elif dt == dt_types[3]:
|
519 |
+
dv = ast.literal_eval(dv)
|
520 |
+
time = dv['value']['time']
|
521 |
+
timezone = dv['value']['timezone']
|
522 |
+
precision = dv['value']['precision']
|
523 |
+
assert dv['value']['after'] == 0 and dv['value']['before'] == 0
|
524 |
+
|
525 |
+
sufix = 'BC' if time[0] == '-' else ''
|
526 |
+
time = time[1:]
|
527 |
+
|
528 |
+
if precision == 11: #date
|
529 |
+
return ([
|
530 |
+
datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%-d of %B, %Y') + sufix,
|
531 |
+
datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%d/%m/%Y (dd/mm/yyyy)') + sufix,
|
532 |
+
datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%b %-d, %Y') + sufix
|
533 |
+
], 'en')
|
534 |
+
else: # coarser precisions (month, year, ...) have no alias
|
535 |
+
return ('no-alias', 'none')
|
536 |
+
elif dt == dt_types[4]:
|
537 |
+
return ('no-alias', 'none')
|
538 |
+
except ValueError as e:
|
539 |
+
#pdb.set_trace()
|
540 |
+
raise e
|
541 |
+
|
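# --- Illustrative sketch (not part of the committed file) ---
# The quantity branches above receive the datavalue as a dict literal in string
# form; 'unit' is either '1' (dimensionless) or an entity URI whose QID is then
# resolved via the Wikidata API. The sample value below is hypothetical.
import ast

dv = "{'value': {'amount': '+42', 'unit': 'http://www.wikidata.org/entity/Q11573'}, 'type': 'quantity'}"
value = ast.literal_eval(dv)['value']
amount = value['amount'].lstrip('+')      # '42'
unit_qid = value['unit'].split('/')[-1]   # 'Q11573' (metre), passed to Wd_API.get_alias / get_desc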
542 |
+
def textualAugmentation(filtered_df):
|
543 |
+
|
544 |
+
Wd_API = wdutils.CachedWikidataAPI()
|
545 |
+
Wd_API.languages = ['en']
|
546 |
+
|
547 |
+
filtered_df['entity_label'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_label(x, True))
|
548 |
+
filtered_df['entity_desc'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_desc(x))
|
549 |
+
filtered_df['entity_alias'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_alias(x))
|
550 |
+
|
551 |
+
print(' - Predicate augmentation...')
|
552 |
+
filtered_df['property_label'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_label(x, True))
|
553 |
+
filtered_df['property_desc'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_desc(x))
|
554 |
+
filtered_df['property_alias'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_alias(x))
|
555 |
+
|
556 |
+
print(' - Object augmentation...')
|
557 |
+
filtered_df['object_label'] = filtered_df.apply(get_object_label_given_datatype, axis=1)
|
558 |
+
filtered_df['object_desc'] = filtered_df.apply(get_object_desc_given_datatype, axis=1)
|
559 |
+
filtered_df['object_alias'] = filtered_df.apply(get_object_alias_given_datatype, axis=1)
|
560 |
+
|
561 |
+
|
562 |
+
no_subject_label_perc = filtered_df[filtered_df['entity_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100
|
563 |
+
print(' - No subject label %:', no_subject_label_perc, '%')
|
564 |
+
|
565 |
+
no_predicate_label_perc = filtered_df[filtered_df['property_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100
|
566 |
+
print(' - No predicate label %:', no_predicate_label_perc, '%')
|
567 |
+
|
568 |
+
no_object_label_perc = filtered_df[filtered_df['object_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100
|
569 |
+
print(' - No object label %:', no_object_label_perc, '%')
|
570 |
+
return filtered_df
|
571 |
+
|
572 |
+
def urlParser(target_QID):
|
573 |
+
Wd_API = wdutils.CachedWikidataAPI()
|
574 |
+
Wd_API.languages = ['en']
|
575 |
+
db = sqlite3.connect('wikidata_claims_refs_parsed.db')
|
576 |
+
cursor = db.cursor()
|
577 |
+
refs_columns = ['reference_id','reference_property_id', 'reference_index', 'reference_datatype', 'reference_value']
|
578 |
+
cursor.execute('select * from refs where reference_datatype="url";')
|
579 |
+
url_df = pd.DataFrame(cursor.fetchall())
|
580 |
+
url_df.columns = refs_columns
|
581 |
+
def reference_value_to_url(reference_value):
|
582 |
+
if reference_value in ['novalue','somevalue']:
|
583 |
+
return reference_value
|
584 |
+
reference_value = ast.literal_eval(reference_value)
|
585 |
+
assert reference_value['type'] == 'string'
|
586 |
+
return reference_value['value']
|
587 |
+
def reference_value_to_external_id(reference_value):
|
588 |
+
if reference_value in ['novalue','somevalue']:
|
589 |
+
return reference_value
|
590 |
+
reference_value = ast.literal_eval(reference_value)
|
591 |
+
assert reference_value['type'] == 'string'
|
592 |
+
return reference_value['value']
|
593 |
+
def get_formatter_url(entity_id):
|
594 |
+
try:
|
595 |
+
sparql_query = '''
|
596 |
+
SELECT ?item ?itemLabel
|
597 |
+
WHERE
|
598 |
+
{
|
599 |
+
wd:$1 wdt:P1630 ?item.
|
600 |
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
|
601 |
+
}
|
602 |
+
'''.replace('$1',entity_id)
|
603 |
+
sparql_results = Wd_API.query_sparql_endpoint(sparql_query)
|
604 |
+
if len(sparql_results['results']['bindings']) > 0:
|
605 |
+
return sparql_results['results']['bindings'][0]['item']['value']
|
606 |
+
else:
|
607 |
+
return 'no_formatter_url'
|
608 |
+
except Exception:
|
609 |
+
print(entity_id)
|
610 |
+
print(sparql_results)
|
611 |
+
raise
|
612 |
+
url_df['url'] = url_df.reference_value.apply(reference_value_to_url)
|
613 |
+
cursor.execute('select * from refs where reference_datatype="external-id";')
|
614 |
+
ext_id_df = pd.DataFrame(cursor.fetchall())
|
615 |
+
ext_id_df.columns = refs_columns
|
616 |
+
ext_id_df['ext_id'] = ext_id_df.reference_value.apply(reference_value_to_external_id)
|
617 |
+
ext_id_df['formatter_url'] = ext_id_df['reference_property_id'].apply(get_formatter_url)
|
618 |
+
ext_id_df['url'] = ext_id_df.apply(lambda x : x['formatter_url'].replace('$1', x['ext_id']), axis=1)
|
619 |
+
columns_for_join = ['reference_id', 'reference_property_id','reference_index','reference_datatype','url']
|
620 |
+
url_df_pre_join = url_df[columns_for_join]
|
621 |
+
ext_id_df_pre_join = ext_id_df[columns_for_join]
|
622 |
+
all_url_df = pd.concat([url_df_pre_join,ext_id_df_pre_join])
|
623 |
+
all_url_df = all_url_df.sort_values(['reference_id','reference_index'])
|
624 |
+
# drop those with url = 'no_formatter_url'
|
625 |
+
all_url_df = all_url_df[all_url_df['url'] != 'no_formatter_url'].reset_index(drop=True)
|
626 |
+
# drop those with url = somevalue and novalue
|
627 |
+
all_url_df = all_url_df[~all_url_df['url'].isin(['somevalue','novalue'])]
|
628 |
+
reference_id_counts = all_url_df.reference_id.value_counts().reset_index()
|
629 |
+
reference_id_counts.columns = ['reference_id', 'counts']
|
630 |
+
reference_id_counts_equal_1 = reference_id_counts[reference_id_counts['counts'] == 1].reference_id.tolist()
|
631 |
+
all_url_df_eq1 = all_url_df[all_url_df.reference_id.isin(reference_id_counts_equal_1)]
|
632 |
+
all_url_df_eq1 = all_url_df_eq1.reset_index(drop=True).drop('reference_index', axis=1)
|
633 |
+
return all_url_df_eq1
|
634 |
+
|
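# --- Illustrative sketch (not part of the committed file) ---
# How urlParser resolves an external-id reference to a URL: the property's
# formatter URL (P1630) contains a '$1' placeholder that is replaced by the
# stored identifier. The formatter URL and identifier below are hypothetical.
formatter_url = 'https://viaf.org/viaf/$1/'   # e.g. a formatter URL for VIAF ID (P214)
ext_id = '113230702'
resolved = formatter_url.replace('$1', ext_id)   # 'https://viaf.org/viaf/113230702/'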
635 |
+
def htmlParser(url_set, qid):
|
636 |
+
text_reference_sampled_df = url_set
|
637 |
+
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")
|
638 |
+
text_reference_sampled_df['html'] = None
|
639 |
+
for i, row in text_reference_sampled_df.iterrows():
|
640 |
+
|
641 |
+
print(i, row.url)
|
642 |
+
try:
|
643 |
+
response = requests.get(row.url, timeout=10)
|
644 |
+
if response.status_code == 200:
|
645 |
+
html = response.text
|
646 |
+
text_reference_sampled_df.loc[i, 'html'] = html
|
647 |
+
else:
|
648 |
+
print(f"not response, {response.status_code}")
|
649 |
+
text_reference_sampled_df.loc[i, 'html'] = response.status_code
|
650 |
+
except requests.exceptions.Timeout:
|
651 |
+
print("Timeout occurred while fetching the URL:", row.url)
|
652 |
+
text_reference_sampled_df.loc[i, 'html'] = 'TimeOut'
|
653 |
+
pass
|
654 |
+
except Exception as e:
|
655 |
+
print("An error occurred:", str(e))
|
656 |
+
pass
|
657 |
+
text_reference_sampled_df_html = text_reference_sampled_df.copy()
|
658 |
+
text_reference_sampled_df_html['entity_id'] = qid
|
659 |
+
return text_reference_sampled_df_html
|
660 |
+
|
661 |
+
def claim2text(html_set):
|
662 |
+
text_reference_sampled_df_html = html_set
|
663 |
+
Wd_API = wdutils.CachedWikidataAPI()
|
664 |
+
Wd_API.languages = ['en']
|
665 |
+
db = sqlite3.connect('wikidata_claims_refs_parsed.db')
|
666 |
+
cursor = db.cursor()
|
667 |
+
claims_columns = ['entity_id','claim_id','rank','property_id','datatype','datavalue']
|
668 |
+
refs_columns = ['reference_id', 'reference_property_id', 'reference_index', 'reference_datatype', 'reference_value']
|
669 |
+
|
670 |
+
def reference_id_to_claim_id(reference_id):
|
671 |
+
cursor.execute(f'select claim_id from claims_refs where reference_id="{reference_id}"')
|
672 |
+
sql_result = cursor.fetchall()
|
673 |
+
#return sql_result
|
674 |
+
randomly_chosen_claim_id = np.array(sql_result).reshape(-1)
|
675 |
+
return randomly_chosen_claim_id
|
676 |
+
|
677 |
+
def reference_id_to_claim_data(reference_id):
|
678 |
+
claim_ids = reference_id_to_claim_id(reference_id)
|
679 |
+
r = []
|
680 |
+
for claim_id in claim_ids:
|
681 |
+
#print(claim_id)
|
682 |
+
cursor.execute(f'select * from claims where claim_id="{claim_id}";')
|
683 |
+
d = cursor.fetchall()
|
684 |
+
r = r + d
|
685 |
+
return r
|
686 |
+
|
687 |
+
claim_data = []
|
688 |
+
for reference_id in text_reference_sampled_df_html.reference_id:
|
689 |
+
data = reference_id_to_claim_data(reference_id)
|
690 |
+
#print(data)
|
691 |
+
data = [(reference_id,) + t for t in data]
|
692 |
+
claim_data = claim_data + data
|
693 |
+
#break
|
694 |
+
|
695 |
+
claim_df = pd.DataFrame(claim_data, columns = ['reference_id'] + claims_columns)
|
696 |
+
claim_df
|
697 |
+
|
698 |
+
def claim_id_to_claim_url(claim_id):
|
699 |
+
claim_id_parts = claim_id.split('$')
|
700 |
+
return f'https://www.wikidata.org/wiki/{claim_id_parts[0]}#{claim_id}'
|
701 |
+
|
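# --- Illustrative sketch (not part of the committed file) ---
# Wikidata statement IDs have the form '<QID>$<uuid>', so the helper above can
# link straight to the statement anchor on the entity page. The ID is made up.
example_claim_id = 'Q42$1d8354a3-4d0f-9f3a-bb72-ec4cf52e5de2'
example_url = f'https://www.wikidata.org/wiki/{example_claim_id.split("$")[0]}#{example_claim_id}'
# 'https://www.wikidata.org/wiki/Q42#Q42$1d8354a3-...'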
702 |
+
BAD_DATATYPES = ['external-id','commonsMedia','url', 'globe-coordinate', 'wikibase-lexeme', 'wikibase-property']
|
703 |
+
|
704 |
+
assert claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reference_id.unique().shape\
|
705 |
+
== claim_df.reference_id.unique().shape
|
706 |
+
|
707 |
+
print(claim_df.reference_id.unique().shape[0])
|
708 |
+
claim_df = claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reset_index(drop=True)
|
709 |
+
|
710 |
+
from tqdm.auto import tqdm
|
711 |
+
tqdm.pandas()
|
712 |
+
|
713 |
+
claim_df[['entity_label','entity_label_lan']] = pd.DataFrame(
|
714 |
+
claim_df.entity_id.progress_apply(Wd_API.get_label, non_language_set=True).tolist()
|
715 |
+
)
|
716 |
+
claim_df[['property_label','property_label_lan']] = pd.DataFrame(
|
717 |
+
claim_df.property_id.progress_apply(Wd_API.get_label, non_language_set=True).tolist()
|
718 |
+
)
|
719 |
+
|
720 |
+
claim_df[['entity_alias','entity_alias_lan']] = pd.DataFrame(
|
721 |
+
claim_df.entity_id.progress_apply(Wd_API.get_alias, non_language_set=True).tolist()
|
722 |
+
)
|
723 |
+
claim_df[['property_alias','property_alias_lan']] = pd.DataFrame(
|
724 |
+
claim_df.property_id.progress_apply(Wd_API.get_alias, non_language_set=True).tolist()
|
725 |
+
)
|
726 |
+
|
727 |
+
claim_df[['entity_desc','entity_desc_lan']] = pd.DataFrame(
|
728 |
+
claim_df.entity_id.progress_apply(Wd_API.get_desc, non_language_set=True).tolist()
|
729 |
+
)
|
730 |
+
claim_df[['property_desc','property_desc_lan']] = pd.DataFrame(
|
731 |
+
claim_df.property_id.progress_apply(Wd_API.get_desc, non_language_set=True).tolist()
|
732 |
+
)
|
733 |
+
|
734 |
+
claim_df['object_label'] = claim_df.apply(get_object_label_given_datatype, axis=1)
|
735 |
+
claim_df['object_alias'] = claim_df.apply(get_object_alias_given_datatype, axis=1)
|
736 |
+
claim_df['object_desc'] = claim_df.apply(get_object_desc_given_datatype, axis=1)
|
737 |
+
|
738 |
+
claim_df['object_label'], claim_df['object_label_lan'] = zip(*claim_df['object_label'].apply(lambda x: x if isinstance(x, tuple) else (x, '')))
|
739 |
+
claim_df['object_alias'], claim_df['object_alias_lan'] = zip(*claim_df['object_alias'].apply(lambda x: x if isinstance(x, tuple) else (x, '')))
|
740 |
+
claim_df['object_desc'], claim_df['object_desc_lan'] = zip(*claim_df['object_desc'].apply(lambda x: x if isinstance(x, tuple) else (x, '')))
|
741 |
+
|
742 |
+
# Removing bad object labels
|
743 |
+
claim_df = claim_df[claim_df['object_label_lan'] != 'none'].reset_index(drop=True)
|
744 |
+
return claim_df
|
745 |
+
|
746 |
+
def html2text(html_set):
|
747 |
+
reference_html_df = html_set
|
748 |
+
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")
|
749 |
+
ft_model = fasttext.load_model('base/lid.176.ftz')
|
750 |
+
def predict_language(text, k=20):
|
751 |
+
ls, scores = ft_model.predict(text, k=k) # top 20 matching languages
|
752 |
+
ls = [l.replace('__label__','') for l in ls]
|
753 |
+
return list(zip(ls,scores))
|
754 |
+
def get_url_language(html):
|
755 |
+
try:
|
756 |
+
soup = BeautifulSoup(html, "lxml")
|
757 |
+
[s.decompose() for s in soup("script")] # remove <script> elements
|
758 |
+
if soup.body is None:
|
759 |
+
return ('no body', None)
|
760 |
+
body_text = _RE_COMBINE_WHITESPACE.sub(" ", soup.body.get_text(' ')).strip()
|
761 |
+
return predict_language(body_text, k=1)[0]
|
762 |
+
except Exception:
|
763 |
+
raise
|
764 |
+
def get_text_p_tags(soup):
|
765 |
+
p_tags = soup.find_all('p')
|
766 |
+
text = [p.getText().strip() for p in p_tags if p.getText()]
|
767 |
+
return '\n'.join(text)
|
768 |
+
def clean_text_line_by_line(text, join=True, ch_join = ' ', verb=True):
|
769 |
+
# text = soup.body.get_text()
|
770 |
+
# break into lines and remove leading and trailing space on each
|
771 |
+
lines = list(text.splitlines())
|
772 |
+
lines = (line.strip() for line in lines)
|
773 |
+
# for each line, lets correct double spaces into single space
|
774 |
+
lines = (re.sub(r' {2,}', ' ', line) for line in lines)
|
775 |
+
# for each line, lets correct punctuation spaced to the left
|
776 |
+
lines = (re.sub(r' ([.,:;!?\\-])', r'\1', line) for line in lines)
|
777 |
+
# put periods if missing
|
778 |
+
lines = [line+'.' if line and line[-1] not in string.punctuation else line for i, line in enumerate(lines)]
|
779 |
+
|
780 |
+
if verb:
|
781 |
+
for i, line in enumerate(lines):
|
782 |
+
print(i,line)
|
783 |
+
# drop blank lines
|
784 |
+
if join:
|
785 |
+
return ch_join.join([line for line in lines if line])
|
786 |
+
else:
|
787 |
+
return [line for line in lines if line]
|
788 |
+
|
789 |
+
def apply_manual_rules(text):
|
790 |
+
# RULE: A line ending with a ':' followed by whitespaces and a newline is likely a continuing line and should be joined
|
791 |
+
#text = re.sub(
|
792 |
+
# r':\s*\n',
|
793 |
+
# r': ',
|
794 |
+
# text
|
795 |
+
#)
|
796 |
+
# RULE: Remove [1] reference numbers
|
797 |
+
text = re.sub(r'\[[0-9]+\]', '', text)
|
798 |
+
return text
|
799 |
+
def retrieve_text_from_html(html, soup_parser = 'lxml', verb=True, join=True):
|
800 |
+
if not isinstance(html, str) or 'DOCTYPE html' not in html:
|
801 |
+
return 'No body'
|
802 |
+
soup = BeautifulSoup(html, soup_parser)
|
803 |
+
for script in soup(["script", "style"]):
|
804 |
+
script.decompose()
|
805 |
+
if soup.body is None:
|
806 |
+
return 'No body'
|
807 |
+
[s.unwrap() for s in soup.body.find_all('strong')]
|
808 |
+
|
809 |
+
for p in soup.body.find_all('p'):
|
810 |
+
p.string = _RE_COMBINE_WHITESPACE.sub(" ", p.get_text('')).strip()
|
811 |
+
|
812 |
+
#DECOMPOSE ALL BAD TAGS
|
813 |
+
#--------------
|
814 |
+
#for c in ['warningbox', 'metadata', 'references', 'navbox', 'toc', 'catlinks']:
|
815 |
+
# for e in soup.body.find_all(class_=c):
|
816 |
+
# print('decomposed',e)
|
817 |
+
# e.decompose()
|
818 |
+
|
819 |
+
# DECOMPOSE INVISIBLE ELEMENTS
|
820 |
+
#for e in soup.body.find_all():
|
821 |
+
# if e.hidden:
|
822 |
+
# print('decomposed',e)
|
823 |
+
# e.decompose()
|
824 |
+
# else:
|
825 |
+
# if e.attrs is not None:
|
826 |
+
# #print(e)
|
827 |
+
# #print('-')
|
828 |
+
# style = e.get('style')
|
829 |
+
# if style and 'display' in style and 'none' in style:
|
830 |
+
# print('decomposed',e)
|
831 |
+
# e.decompose()
|
832 |
+
# #print(e, style)
|
833 |
+
#--------------
|
834 |
+
|
835 |
+
#print(soup.body)
|
836 |
+
|
837 |
+
# BOILERPLATE REMOVAL OPTIONS
|
838 |
+
#1. jusText
|
839 |
+
#text = justext.justext(html, justext.get_stoplist("English"))
|
840 |
+
#text = '\n'.join([paragraph.text for paragraph in text if not paragraph.is_boilerplate])
|
841 |
+
|
842 |
+
#2. boilerpy3
|
843 |
+
#html = soup.body
|
844 |
+
#text = extractor.get_content(soup.prettify())
|
845 |
+
|
846 |
+
#3. Just extracting from 'text tags' like p
|
847 |
+
#simple rules (does not work depending on website, like on artgallery.yale, anything without clear paragraphic style)
|
848 |
+
#text = get_text_p_tags(soup)
|
849 |
+
|
850 |
+
#4. NONE
|
851 |
+
text = soup.body.get_text(' ').strip() # NOT GETTING FROM THE WHOLE SOUP, JUST BODY TO AVOID TITLES
|
852 |
+
|
853 |
+
#POST PROCESSING
|
854 |
+
text = apply_manual_rules(text)
|
855 |
+
text = clean_text_line_by_line(text, ch_join = ' ', verb=verb, join=join)
|
856 |
+
|
857 |
+
if not text:
|
858 |
+
return 'No extractable text' if join else ['No extractable text']
|
859 |
+
else:
|
860 |
+
return text
|
861 |
+
i=0
|
862 |
+
print(i)
|
863 |
+
print(reference_html_df.url.iloc[i])
|
864 |
+
|
865 |
+
reference_html_df['extracted_sentences'] = reference_html_df.html.progress_apply(retrieve_text_from_html, join=False, verb=False)
|
866 |
+
|
867 |
+
join_ch = ' '
|
868 |
+
reference_html_df['extracted_text'] = reference_html_df.extracted_sentences.apply(lambda x : join_ch.join(x))
|
869 |
+
|
870 |
+
splitter = SentenceSplitter(language='en')
|
871 |
+
|
872 |
+
seg = pysbd.Segmenter(language="en", clean=False)
|
873 |
+
|
874 |
+
nlp = spacy.load("en_core_web_lg")
|
875 |
+
|
876 |
+
text = reference_html_df.loc[0,'extracted_text']
|
877 |
+
|
878 |
+
# OPTION 1
|
879 |
+
# This gets some things wrong, such as Smt.=Shrimati ending a sentence, or any
|
880 |
+
# initials like P. N. Nampoothiri or Lt. Col.
|
881 |
+
#sents = sent_tokenize(text)
|
882 |
+
|
883 |
+
# OPTION 2
|
884 |
+
# Also breaks titles and initials like above, but additionally gets parenthesis wrong, like
|
885 |
+
# Amma Maharani [break](queen mother) [break] of Travancore.
|
886 |
+
#sents = seg.segment(text)
|
887 |
+
|
888 |
+
# OPTION 3
|
889 |
+
# Same as above plus new ones, like breaking contractions (like m. for married)
|
890 |
+
#sents = splitter.split(text)
|
891 |
+
|
892 |
+
# OPTION 4
|
893 |
+
# By far the best option, makes way less of the mistakes above, but not none. So let's adopt a strategy so ease this.
|
894 |
+
sents = [s for s in nlp(text).sents]
|
895 |
+
|
896 |
+
|
897 |
+
reference_html_df['nlp_sentences'] = reference_html_df.extracted_text.progress_apply(lambda x : [str(s) for s in nlp(x).sents])
|
898 |
+
reference_html_df['nlp_sentences_slide_2'] = reference_html_df['nlp_sentences'].progress_apply(
|
899 |
+
lambda x : [' '.join([a,b]) for a,b in list(zip(x,x[1:]+['']))]
|
900 |
+
)
|
901 |
+
|
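# --- Illustrative sketch (not part of the committed file) ---
# What the 'nlp_sentences_slide_2' column above holds: each sentence joined with
# its successor, so evidence that spans a sentence boundary is still retrievable
# as one passage. The sentences below are made up for illustration.
sents = ['Douglas Adams was born in Cambridge.',
         "He wrote The Hitch Hiker's Guide to the Galaxy.",
         'He died in 2001.']
slide_2 = [' '.join([a, b]) for a, b in zip(sents, sents[1:] + [''])]
# slide_2[0] == "Douglas Adams was born in Cambridge. He wrote The Hitch Hiker's Guide to the Galaxy."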
902 |
+
assert type(reference_html_df.loc[0,'nlp_sentences']) == list
|
903 |
+
assert type(reference_html_df.loc[0,'nlp_sentences'][0]) == str
|
904 |
+
assert type(reference_html_df.loc[0,'nlp_sentences_slide_2']) == list
|
905 |
+
assert type(reference_html_df.loc[0,'nlp_sentences_slide_2'][0]) == str
|
906 |
+
return reference_html_df
|
907 |
+
|
908 |
+
if __name__ == '__main__':
|
909 |
+
conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
|
910 |
+
target_QID = 'Q3621696'
|
911 |
+
claimParser(target_QID) #save results in .db
|
912 |
+
filtered_df = propertyFiltering(target_QID) #update db and return dataframe after filtering
|
913 |
+
url_set = urlParser(target_QID) #from ref table in .db
|
914 |
+
html_set = htmlParser(url_set, target_QID) #Original html docs collection
|
915 |
+
try:
|
916 |
+
claim_text = claim2text(html_set) #Claims generation
|
917 |
+
html_text = html2text(html_set)
|
918 |
+
claim_text = claim_text.astype(str)
|
919 |
+
html_text = html_text.astype(str)
|
920 |
+
claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
|
921 |
+
html_text.to_sql('html_text', conn, if_exists='replace', index=False)
|
922 |
+
except Exception as e:
|
923 |
+
print(f"No accessible html documents")
|
924 |
+
|
925 |
+
|
926 |
+
conn.commit()
|
927 |
+
conn.close()
|
928 |
+
#augmented_df = textualAugmentation(filtered_df) #textual information augmentation including label, desc, and alias
|
929 |
+
|
app.py
ADDED
@@ -0,0 +1,122 @@
1 |
+
import gradio as gr
|
2 |
+
import Wikidata_Text_Parser as wtr
|
3 |
+
import sqlite3
|
4 |
+
import Prove_lite as prv
|
5 |
+
import pandas as pd
|
6 |
+
import numpy as np
|
7 |
+
import os
|
8 |
+
|
9 |
+
def wtr_process(qid):
|
10 |
+
try:
|
11 |
+
conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
|
12 |
+
target_QID = qid
|
13 |
+
query = f"SELECT * FROM {'claim_text'}"
|
14 |
+
df = pd.read_sql_query(query, conn)
|
15 |
+
if target_QID in df['entity_id'].unique():
|
16 |
+
pass
|
17 |
+
else:
|
18 |
+
wtr.claimParser(target_QID) #save results in .db
|
19 |
+
filtered_df = wtr.propertyFiltering(target_QID) #update db and return dataframe after filtering
|
20 |
+
url_set = wtr.urlParser(target_QID) #from ref table in .db
|
21 |
+
html_set = wtr.htmlParser(url_set, target_QID) #Original html docs collection
|
22 |
+
claim_text = wtr.claim2text(html_set) #Claims generation
|
23 |
+
html_text = wtr.html2text(html_set)
|
24 |
+
claim_text = claim_text.astype(str)
|
25 |
+
html_text = html_text.astype(str)
|
26 |
+
claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
|
27 |
+
html_text.to_sql('html_text', conn, if_exists='replace', index=False)
|
28 |
+
conn.commit()
|
29 |
+
query = f"""
|
30 |
+
SELECT
|
31 |
+
claim_text.entity_label,
|
32 |
+
claim_text.property_label,
|
33 |
+
claim_text.object_label,
|
34 |
+
html_text.url
|
35 |
+
FROM claim_text
|
36 |
+
INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
|
37 |
+
WHERE claim_text.entity_id = '{target_QID}'
|
38 |
+
"""
|
39 |
+
|
40 |
+
result_df = pd.read_sql_query(query, conn)
|
41 |
+
|
42 |
+
conn.commit()
|
43 |
+
conn.close()
|
44 |
+
|
45 |
+
return result_df
|
46 |
+
|
47 |
+
except Exception as e:
|
48 |
+
error_df = pd.DataFrame({'Error': [str(e)]})
|
49 |
+
return error_df
|
50 |
+
|
51 |
+
|
52 |
+
def prv_process(qid):
|
53 |
+
target_QID = qid
|
54 |
+
conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
|
55 |
+
query = f"SELECT * FROM claim_text WHERE entity_id = '{target_QID}'"
|
56 |
+
claim_df = pd.read_sql_query(query, conn)
|
57 |
+
|
58 |
+
query = f"SELECT * FROM html_text Where entity_id = '{target_QID}'"
|
59 |
+
reference_text_df = pd.read_sql_query(query, conn)
|
60 |
+
|
61 |
+
verbalised_claims_df_final = prv.verbalisation(claim_df)
|
62 |
+
|
63 |
+
progress = gr.Progress(len(verbalised_claims_df_final)) # Create progress bar for Gradio
|
64 |
+
def update_progress(curr_step, total_steps):
|
65 |
+
progress((curr_step + 1) / total_steps)
|
66 |
+
|
67 |
+
splited_sentences_from_html = prv.setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)
|
68 |
+
|
69 |
+
BATCH_SIZE = 512
|
70 |
+
N_TOP_SENTENCES = 5
|
71 |
+
SCORE_THRESHOLD = 0
|
72 |
+
evidence_df = prv.evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
|
73 |
+
result = prv.textEntailment(evidence_df, SCORE_THRESHOLD)
|
74 |
+
display_df = prv.TableMaking(verbalised_claims_df_final, result)
|
75 |
+
conn.commit()
|
76 |
+
conn.close()
|
77 |
+
return display_df
|
78 |
+
|
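# --- Illustrative sketch (not part of the committed file) ---
# prv.evidenceSelection is defined in Prove_lite.py and not shown in this diff;
# a minimal sketch of top-k evidence selection with sentence embeddings, assuming
# it ranks reference sentences by cosine similarity against the verbalised claim.
# The model name and the sentences are illustrative assumptions, not the app's own.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

st_model = SentenceTransformer('all-MiniLM-L6-v2')
claim = 'Douglas Adams was born in Cambridge.'
candidates = ['Adams was born in Cambridge in 1952.',
              'The weather was mild that day.',
              'He moved to California in 1999.']
scores = cosine_similarity(st_model.encode([claim]), st_model.encode(candidates))[0]
top_sentences = sorted(zip(candidates, scores), key=lambda p: p[1], reverse=True)[:2]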
79 |
+
|
80 |
+
|
81 |
+
with gr.Blocks() as demo:
|
82 |
+
print("gradio started!")
|
83 |
+
gr.Markdown(
|
84 |
+
"""
|
85 |
+
# Prove
|
86 |
+
This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
|
87 |
+
"""
|
88 |
+
)
|
89 |
+
inp = gr.Textbox(label="Input QID", placeholder="Input QID (i.e. Q245247)")
|
90 |
+
out = gr.Dataframe(label="Parsing result (not presenting parsed HTMLs)", headers=["entity_label", "property_label", "object_label", "url"])
|
91 |
+
run_button_1 = gr.Button("Start parsing")
|
92 |
+
run_button_1.click(wtr_process, inp, out)
|
93 |
+
|
94 |
+
|
95 |
+
gr.Markdown(
|
96 |
+
"""
|
97 |
+
Pre-trained language models-based text entailment.
|
98 |
+
"""
|
99 |
+
)
|
100 |
+
out_2 = gr.HTML(label="Results")
|
101 |
+
run_button_2 = gr.Button("Start processing")
|
102 |
+
run_button_2.click(prv_process, inp, out_2)
|
103 |
+
|
104 |
+
|
105 |
+
if __name__ == "__main__":
|
106 |
+
#DB initialising
|
107 |
+
if os.path.isfile('wikidata_claims_refs_parsed.db') != True:
|
108 |
+
conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
|
109 |
+
target_QID = 'Q115305900'
|
110 |
+
wtr.claimParser(target_QID) #save results in .db
|
111 |
+
filtered_df = wtr.propertyFiltering(target_QID) #update db and return dataframe after filtering
|
112 |
+
url_set = wtr.urlParser(target_QID) #from ref table in .db
|
113 |
+
html_set = wtr.htmlParser(url_set, target_QID) #Original html docs collection
|
114 |
+
claim_text = wtr.claim2text(html_set) #Claims generation
|
115 |
+
html_text = wtr.html2text(html_set)
|
116 |
+
claim_text = claim_text.astype(str)
|
117 |
+
html_text = html_text.astype(str)
|
118 |
+
claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
|
119 |
+
html_text.to_sql('html_text', conn, if_exists='replace', index=False)
|
120 |
+
conn.commit()
|
121 |
+
conn.close()
|
122 |
+
demo.launch(share=True)
|
llm_load copy.py
ADDED
@@ -0,0 +1,188 @@
1 |
+
from huggingface_hub import login
|
2 |
+
from unsloth import FastLanguageModel
|
3 |
+
import torch
|
4 |
+
from transformers import TextStreamer
|
5 |
+
|
6 |
+
def llmLoad(max_seq_length):
|
7 |
+
with open('API_key.txt', 'r') as file:
|
8 |
+
token = file.read().strip()
|
9 |
+
login(token=token)
|
10 |
+
|
11 |
+
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
12 |
+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
|
13 |
+
|
14 |
+
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
|
15 |
+
fourbit_models = [
|
16 |
+
"unsloth/mistral-7b-bnb-4bit",
|
17 |
+
"unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
|
18 |
+
"unsloth/llama-2-7b-bnb-4bit",
|
19 |
+
"unsloth/gemma-7b-bnb-4bit",
|
20 |
+
"unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
|
21 |
+
"unsloth/gemma-2b-bnb-4bit",
|
22 |
+
"unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
|
23 |
+
"unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3
|
24 |
+
] # More models at https://huggingface.co/unsloth
|
25 |
+
|
26 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
27 |
+
model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
|
28 |
+
max_seq_length = max_seq_length,
|
29 |
+
dtype = dtype,
|
30 |
+
load_in_4bit = load_in_4bit,
|
31 |
+
)
|
32 |
+
return tokenizer, model
|
33 |
+
|
34 |
+
def llmQuestion(tokenizer, model, instruct, question, output_size):
|
35 |
+
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
|
36 |
+
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
37 |
+
|
38 |
+
### Instruction:
|
39 |
+
{}
|
40 |
+
|
41 |
+
### Input:
|
42 |
+
{}
|
43 |
+
|
44 |
+
### Response:
|
45 |
+
{}"""
|
46 |
+
|
47 |
+
# alpaca_prompt = Copied from above
|
48 |
+
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
|
49 |
+
inputs = tokenizer(
|
50 |
+
[
|
51 |
+
alpaca_prompt.format(
|
52 |
+
instruct, # instruction
|
53 |
+
question, # input
|
54 |
+
"", # output - leave this blank for generation!
|
55 |
+
)
|
56 |
+
], return_tensors = "pt").to("cuda")
|
57 |
+
|
58 |
+
|
59 |
+
outputs = model.generate(**inputs, max_new_tokens=output_size, use_cache=True)
|
60 |
+
output_text = tokenizer.batch_decode(outputs)[0].split('### Response:')[1]
|
61 |
+
|
62 |
+
return output_text
|
63 |
+
|
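# --- Illustrative sketch (not part of the committed file) ---
# How llmQuestion recovers the answer: the decoded generation still contains the
# full Alpaca prompt, so everything after the '### Response:' marker is taken as
# the model's reply. The decoded string below is a made-up stand-in for
# tokenizer.batch_decode(outputs)[0].
decoded = ("Below is an instruction that describes a task, ...\n\n"
           "### Instruction:\nFind relevant sentences from text_dump ...\n\n"
           "### Input:\ntarget sentence: ...\n\n"
           "### Response:\nSentences 3 and 7 support the claim.<|eot_id|>")
answer = decoded.split('### Response:')[1]   # "\nSentences 3 and 7 support the claim.<|eot_id|>"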
64 |
+
if __name__ == "__main__":
|
65 |
+
tokenizer, model = llmLoad(8192)
|
66 |
+
sentences = """['\n \n \n\t\t\t\n\t\t\t\n\t\t \n \n \n \n \n \n \n \n \n \n UK News Website of the Year\n \n \n The Telegraph logo\n \n \n \n \n ',
|
67 |
+
'\n \n \n \n \n Search Icon\n \n \n \n News \n Sport \n Money \n Travel \n ',
|
68 |
+
'Business \n Health \n Opinion \n General election \n Ukraine \n Royals \n Life & Style \n Culture \n ',
|
69 |
+
" Puzzles \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__navigation .e-site-header-button__link').forEach(link => {\n\t\t\t\tlink.addEventListener('click', (e) => {\n",
|
70 |
+
'\t\t\t\t\teVar94 = "header-search-icon-mobile";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmgComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n',
|
71 |
+
'\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n \n \n \n \n \n UK Edition \n \n \n ',
|
72 |
+
' \n \n \n US Edition \n \n \n \n \n \n \n \n Search Icon\n \n \n \n Subscribe now Free for one month',
|
73 |
+
' \n \n \n \n \n \n \n \n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tLog in\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n \n \n \n \n \n \n \n \n \n ',
|
74 |
+
' \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n Sections\n \n ',
|
75 |
+
' \n \n UK Edition \n \n \n \n \n \n US Edition \n \n \n \n \n \n \n',
|
76 |
+
' News\n \n \n \n \n News home \n UK news \n Politics \n World \n Health news \n Defe',
|
77 |
+
'nce \n Science \n Education \n Environment \n Investigations \n Global Health Security \n \n Sport\n \n \n ',
|
78 |
+
" \n \n Sport home \n Football \n Rugby Union \n Cricket \n F1 \n Golf \n Tennis \n Women's Sp",
|
79 |
+
'ort \n Racing \n Cycling \n Boxing \n More... \n \n Money\n \n \n \n \n Money home \n ',
|
80 |
+
' Property \n Tax \n Pensions \n Banking \n Investing \n Net Zero \n Calculators \n Guides \n \n ',
|
81 |
+
'Travel\n \n \n \n \n Travel home \n Europe \n UK \n Worldwide \n City breaks \n Hotels \n ',
|
82 |
+
' Cruise \n Ski \n Advice \n \n Business\n \n \n \n \n Business home \n Alex \n Ec',
|
83 |
+
'onomy \n Companies \n Markets \n Tech \n \n Health\n \n \n \n \n Health home \n Diet \n ',
|
84 |
+
' Fitness \n Conditions \n Wellbeing \n Parenting \n Guides \n Tools \n \n Opinion\n \n \n ',
|
85 |
+
' \n \n Opinion home \n Obituaries \n Letters to the Editor \n Telegraph View \n Our columnists \n Cartoons \n \n ',
|
86 |
+
' General election \n Ukraine\n \n \n \n \n Ukraine home \n Daily podcast \n Daily newsletter \n \n ',
|
87 |
+
' Royals\n \n \n \n \n Royals home \n King Charles III \n Queen Camilla \n Prince William \n Prince',
|
88 |
+
'ss of Wales \n Prince Harry \n Duchess of Sussex \n \n Life & Style\n \n \n \n \n Life & Style home \n ',
|
89 |
+
' Family \n Columnists \n Cookbook \n Food & Drink \n Fashion \n Beauty \n Luxury \n Cars \n Gardening \n ',
|
90 |
+
' Interiors \n Puzzle News \n Recommended \n Tel Mag \n \n Culture\n \n \n \n \n Culture hom',
|
91 |
+
'e \n TV \n Film \n Music \n Books \n Theatre \n Comedy \n Dance \n Opera \n Art \n \n ',
|
92 |
+
' Telegraph Tickets \n \n Puzzles \n \n \n \n \n \n \n \n UK Edition \n \n \n \n ',
|
93 |
+
' \n US Edition \n \n \n \n \n \n \n \n \n Subscribe now Free for one month \n \n \n ',
|
94 |
+
' \n \n \n \n \n \n \n \n Log in\n \n Login icon\n \n \n \n \n ',
|
95 |
+
' \n Follow us on:\n \n \n \n Facebook icon\n \n \n \n Instagram icon\n \n \n ',
|
96 |
+
' \n X icon\n \n \n \n Snapchat icon\n \n \n \n LinkedIn icon\n \n \n \n ',
|
97 |
+
' YouTube icon \n \n \n \n \n \n \n More from The Telegraph\n \n \n Download our app \n Newsletters \n ',
|
98 |
+
' Telegraph Extra \n Recommended \n Financial Solutions \n Events \n Betting \n Dating \n Offers \n Travel offers \n Shop \n ',
|
99 |
+
' Garden shop \n Bookshop \n Tickets \n Puzzles \n Fantasy Football \n Work at The Telegraph \n Telegraph Corporate \n Help and suppo',
|
100 |
+
'rt \n The Chelsea Magazine Company \n Broadband and Mobile Deals \n Voucher codes \n \n See top shops\n \n \n \n ',
|
101 |
+
' \n Samsung \n Nike \n ASOS \n eBay \n Currys \n Wayfair \n TUI \n JD Sports \n Travelodg',
|
102 |
+
'e \n Adidas \n Broadband deals \n Cheap broadband \n Broadband in my area \n Broadband and TV deals \n Mobile deals \n ',
|
103 |
+
" SIM-only deals \n \n \n \n \n \n \n \n \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__buttons .e-site-header-button__link').forE",
|
104 |
+
'ach(link => {\n\t\t\t\tlink.addEventListener(\'click\', (e) => {\n\t\t\t\t\teVar94 = "header-search-icon-desktop";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmg',
|
105 |
+
'ComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n\t\n\t\t\n\t\t\t\n\t\t Jump to navigation\n \n \n \n',
|
106 |
+
" \n \n \n \n \n \n Hitch Hiker's Guide author Douglas Adams dies aged 49\n \n \n \n \n By Andrew Alderson and Daniel Foggo 13 May 2001 • 12:00am \n \n \n ",
|
107 |
+
"\n \n \n \n DOUGLAS ADAMS, the thought-provoking author who inspired a generation with his cult science-fiction novel, The Hitch Hiker's Guide to the Galaxy, has died at the age of 4",
|
108 |
+
'9 from a heart attack while working out at the gym.\n \n \n \n \n \n \n \n \n \n \n Douglas Adams: inspired a generation with t',
|
109 |
+
'he cult novel, A Hitch Hiker\'s Guide to the Galaxy\n \n \n \n \n \n \n \n Adams\'s age was seven more than his cryptic answer of "42" to the intriguing ques',
|
110 |
+
'tion the comic novel had posed: what is the answer to life, the universe and everything? His book has sold more than 14 million copies worldwide, but Adams became a household name in Britain after it ',
|
111 |
+
'was turned into a BBC television series in the early 1980s.\n \n \n \n \n Adams, 6ft 5in tall and well built, did not have a history of heart problems. However, say friends, he',
|
112 |
+
' had visited the doctor just days ago complaining of a numbness in his arm. He collapsed on Friday while exercising at a gym in Santa Barbara on the west coast of America and never regained consciousn',
|
113 |
+
'ess. He leaves a widow and a six-year-old daughter.\n \n \n \n \n Adams was British but moved with his family to California in 1999, to be involved in a Disney film version of ',
|
114 |
+
'his book: he had previously lived in Islington, north London, for 22 years. A complex man, he was transported from obscurity to fame in 1979 by the instant success of his novel, which became hugely po',
|
115 |
+
'pular with students.\n \n \n \n \n Soon after the book was published, he was invited to sign copies at a small Soho bookshop. On his way there, Adams became convinced he was be',
|
116 |
+
'en caught up in a demonstration, only to discover the crowds were waiting for him.\n \n \n \n \n The book shot to the number one spot in the best-seller list the next day. He s',
|
117 |
+
'aid: "It was like being helicoptered to the top of Mount Everest, or having an orgasm without the foreplay." Adams, however, later suffered from writer\'s block and was so notoriously bad at meeting de',
|
118 |
+
"adlines that Sue Freestone, his former publisher, was even known to move into his house to bully him into writing.\n \n \n \n \n Ed Victor, Adams's literary agent for 20 years ",
|
119 |
+
'and a close friend, was devastated by the news yesterday. He said: "I feel as if someone has torn a limb off me. Tragic is an overused word, but this really is a tragic loss.\n \n \n \n ',
|
120 |
+
' \n Mr Victor said: "He was one of the truly original writers and thinkers of our generation who should have had many years ahead of him. He was not only entertaining, but also stimulating an',
|
121 |
+
'd provoking: he was a unique thinker with a huge audience."\n \n \n \n \n Mr Victor said that writer\'s block had been a terrible problem for Adams, who hated spending time alon',
|
122 |
+
'e. He said: "He was once locked in a hotel suite at the Berkeley for two weeks by Sonny Mehta [his former publisher]. When I asked Douglas how it had worked, he said: \'It was simple. I sat at the desk',
|
123 |
+
' and typed and Sonny sat in an armchair and glowered.\' "\n \n \n \n \n Adams was said to have used The Hitch Hiker\'s Guide, which started off as a radio show in the 1970s, to p',
|
124 |
+
'oke fun at those who seek solutions to unanswerable questions. It was intended to highlight the absurdity of attempting to do so.\n \n \n \n \n The novel has since been turned ',
|
125 |
+
'into a play and a computer game, and has spawned four sequels. Adams also set up a website called h2g2, an entertainment guide now run by the BBC, as a spin-off from his book.\n \n \n \n ',
|
126 |
+
' \n In his novel, which deals with the voyages of a suburban earthling, Arthur Dent, Adams describes a race of hyper-intelligent beings, who had reached a point where they were determined to',
|
127 |
+
' understand the purpose of the universe and their own existence.\n \n \n \n \n They built a supercomputer, Deep Thought, and asked it for the answer to the ultimate question of',
|
128 |
+
' life, the universe and everything. The computer worked for several millennia on the answer. Finally, the beings were shocked and disappointed with the computer\'s ridiculous response: "42".\n \n ',
|
129 |
+
' \n \n \n In the book, the Earth is referred to as "mostly harmless", which became a buzz phrase of the 1980s. Adams was born in Cambridge in 1952 and educated at Brentwood School, E',
|
130 |
+
"ssex, before returning to Cambridge to study at St John's College.\n \n \n \n \n His early career included work as a radio and television writer and producer. Some of his early",
|
131 |
+
" writing was with his friend Graham Chapman, a member of the Monty Python's Flying Circus comedy team.\n \n \n \n \n He later collaborated with Terry Jones, another Python team",
|
132 |
+
' member. Jones was in tears after learning of his friend\'s death yesterday. He told the Telegraph: "Douglas was a total original: he had a beautiful way of thinking and an incisive mind that went stra',
|
133 |
+
'ight to the heart of matters. He had a genius for putting those concepts into words. His books were great works of literature. He was a lovely man, and I loved him."\n \n \n \n \n ',
|
134 |
+
' Senior staff at the BBC, who worked with Adams, were equally sad. Alan Yentob, the corporation\'s director of drama and entertainment, said: "Douglas was a big character who will be hugely missed b',
|
135 |
+
'y a host of friends and millions of fans around the world."\n \n \n \n \n Geoffrey Perkins, the BBC\'s head of comedy and who produced the original radio series of the novel, sa',
|
136 |
+
'id: "I\'ve known Douglas for 25 years. He was absolutely one of the most creative geniuses to ever work in radio comedy."\n \n \n \n \n Adams\'s life was transformed by the publi',
|
137 |
+
"cation of The Hitch Hiker's Guide providing him with a wealth he had never imagined. He married Jane Belson, a barrister, in 1991 and they had a daughter, Polly, in 1994.\n \n \n \n ",
|
138 |
+
"\n Adams's other bestselling titles include The Restaurant at the End of the Universe; Life, the Universe and Everything and So Long, and Thanks for All the Fish. He was in discussion to turn an",
|
139 |
+
"other of his books, Dirk Gently's Holistic Detective Agency, into a film and was working on another novel, which was 12 years late.\n \n \n \n \n \n \n \n \n \n ",
|
140 |
+
' \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n WhatsApp Icon\n \n \n \n Email Icon\n ',
|
141 |
+
' \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n Advertisement\n \n \n \n\n\tMore stories\n\n\n',
|
142 |
+
'\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n Whats',
|
143 |
+
'App Icon\n \n \n \n Email Icon\n \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n \n\n\tMore from The T',
|
144 |
+
'elegraph\n\n\n\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\t\tMore stories\n\t\t\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t',
|
145 |
+
'\n\n\t\t\n\n\t\n\n\n\n\t\t\n\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n',
|
146 |
+
'\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tProtesters charged after blocking coach bound ',
|
147 |
+
'for Bibby Stockholm \n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n',
|
148 |
+
'\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tTelegraph Reporters\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:53am\n\t\t\t\t\t\t\t\n',
|
149 |
+
'\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t',
|
150 |
+
'\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t',
|
151 |
+
'\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t',
|
152 |
+
'\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tCanada police lay charges over murder of Sikh leader and probe Indian ties\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t',
|
153 |
+
'\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tOur F',
|
154 |
+
'oreign Staff\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:12am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t',
|
155 |
+
'\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t',
|
156 |
+
'\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n',
|
157 |
+
'\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n',
|
158 |
+
'\t\t\t\t\n\n\t\t\t\tKing takes on hundreds of new patronages\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t',
|
159 |
+
'\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tVictoria Ward\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 M',
|
160 |
+
'ay 2024, 12:01am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n',
|
161 |
+
'\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t',
|
162 |
+
'\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t',
|
163 |
+
'\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLabour’s strategy ‘won’t last’ into a general election, says Cabinet minister\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n',
|
164 |
+
'\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t',
|
165 |
+
'\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tJack Maidment\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 11:01pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t',
|
166 |
+
'\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n',
|
167 |
+
'\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t',
|
168 |
+
'\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t',
|
169 |
+
'\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLuton waste chance to start great escape in draw with Everton\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t',
|
170 |
+
'\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tWill Conroy\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n',
|
171 |
+
'\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:53pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
|
172 |
+
'\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
|
173 |
+
'\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n',
|
174 |
+
'\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tSeven things you may have missed in the local elections\n\n\t\t\t',
|
175 |
+
'\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t',
|
176 |
+
'\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tDominic Penna\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:37pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
|
177 |
+
'\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n',
|
178 |
+
'\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
|
179 |
+
'\n\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n The Telegraph\n \n \n \n Back to top\n \n \n \n \n \n Follow us on:\n \n \n ',
|
180 |
+
' \n Facebook icon\n \n \n \n Instagram icon\n \n \n \n X icon\n \n \n \n Snapchat icon\n \n \n',
|
181 |
+
' \n LinkedIn icon\n \n \n \n YouTube icon \n \n \n \n \n \n \n \n \n \n Help Centre\n About us\n Telegraph Extra\n ',
|
182 |
+
' Reader Prints\n Branded Content\n Syndication and Commissioning\n Fantasy Sport\n UK Voucher Codes\n Betting Offers\n Tax Strategy\n Broadband and Mobile Deals\n',
|
183 |
+
' The Chelsea Magazine Company\n Newsletters\n Download the Telegraph App\n Privacy\n Terms & Conditions\n Modern Slavery\n Advertising terms\n Guidelines\n ',
|
184 |
+
" \n \n © Telegraph Media Group Limited 2024\n \n \n \n \n \n \n\twindow.addEventListener( 'DOMContentLoaded', function() {\n\t\t_satellite.pageBottom();\n\t});\n\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\n\t \n\t\t\n\t\t\t\n\t",
|
185 |
+
"\t\t\t\n\t\t\t\n\t\n window.RUM_BASE = '/';\nimport { sampleRUM } from '/.rum/@adobe/helix-rum-js@^1/src/index.js';\nsampleRUM('lazy');\nsampleRUM('cwv');\n\n "]"""
|
186 |
+
instruct = "Find relevant sentences from text_dump with given the target sentence"
|
187 |
+
question = f"target sentence:'Adam douglas was born in Cambrige', text_dump:{sentences}"
|
188 |
+
answer = llmQuestion(tokenizer, model, instruct, question, 8192)
|
llm_load.py
ADDED
@@ -0,0 +1,188 @@
1 |
+
from huggingface_hub import login
|
2 |
+
from unsloth import FastLanguageModel
|
3 |
+
import torch
|
4 |
+
from transformers import TextStreamer
|
5 |
+
|
6 |
+
def llmLoad(max_seq_length):
|
7 |
+
with open('API_key.txt', 'r') as file:
|
8 |
+
token = file.read().strip()
|
9 |
+
login(token=token)
|
10 |
+
|
11 |
+
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
12 |
+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
|
13 |
+
|
14 |
+
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
|
15 |
+
fourbit_models = [
|
16 |
+
"unsloth/mistral-7b-bnb-4bit",
|
17 |
+
"unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
|
18 |
+
"unsloth/llama-2-7b-bnb-4bit",
|
19 |
+
"unsloth/gemma-7b-bnb-4bit",
|
20 |
+
"unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
|
21 |
+
"unsloth/gemma-2b-bnb-4bit",
|
22 |
+
"unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
|
23 |
+
"unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3
|
24 |
+
] # More models at https://huggingface.co/unsloth
|
25 |
+
|
26 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
27 |
+
model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
|
28 |
+
max_seq_length = max_seq_length,
|
29 |
+
dtype = dtype,
|
30 |
+
load_in_4bit = load_in_4bit,
|
31 |
+
)
|
32 |
+
return tokenizer, model
|
33 |
+
|
34 |
+
def llmQuestion(tokenizer, model, instruct, question, output_size):
|
35 |
+
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
|
36 |
+
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
37 |
+
|
38 |
+
### Instruction:
|
39 |
+
{}
|
40 |
+
|
41 |
+
### Input:
|
42 |
+
{}
|
43 |
+
|
44 |
+
### Response:
|
45 |
+
{}"""
|
46 |
+
|
47 |
+
# alpaca_prompt = Copied from above
|
48 |
+
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
|
49 |
+
inputs = tokenizer(
|
50 |
+
[
|
51 |
+
alpaca_prompt.format(
|
52 |
+
instruct, # instruction
|
53 |
+
question, # input
|
54 |
+
"", # output - leave this blank for generation!
|
55 |
+
)
|
56 |
+
], return_tensors = "pt").to("cuda")
|
57 |
+
|
58 |
+
|
59 |
+
outputs = model.generate(**inputs, max_new_tokens=output_size, use_cache=True)
|
60 |
+
output_text = tokenizer.batch_decode(outputs)[0].split('### Response:')[1]
|
61 |
+
|
62 |
+
return output_text
|
63 |
+
|
64 |
+
if __name__ == "__main__":
|
65 |
+
tokenizer, model = llmLoad(8192)
|
66 |
+
sentences = """['\n \n \n\t\t\t\n\t\t\t\n\t\t \n \n \n \n \n \n \n \n \n \n UK News Website of the Year\n \n \n The Telegraph logo\n \n \n \n \n ',
|
67 |
+
'\n \n \n \n \n Search Icon\n \n \n \n News \n Sport \n Money \n Travel \n ',
|
68 |
+
'Business \n Health \n Opinion \n General election \n Ukraine \n Royals \n Life & Style \n Culture \n ',
|
69 |
+
" Puzzles \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__navigation .e-site-header-button__link').forEach(link => {\n\t\t\t\tlink.addEventListener('click', (e) => {\n",
|
70 |
+
'\t\t\t\t\teVar94 = "header-search-icon-mobile";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmgComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n',
|
71 |
+
'\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n \n \n \n \n \n UK Edition \n \n \n ',
|
72 |
+
' \n \n \n US Edition \n \n \n \n \n \n \n \n Search Icon\n \n \n \n Subscribe now Free for one month',
|
73 |
+
' \n \n \n \n \n \n \n \n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tLog in\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n \n \n \n \n \n \n \n \n \n ',
|
74 |
+
' \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n Sections\n \n ',
|
75 |
+
' \n \n UK Edition \n \n \n \n \n \n US Edition \n \n \n \n \n \n \n',
|
76 |
+
' News\n \n \n \n \n News home \n UK news \n Politics \n World \n Health news \n Defe',
|
77 |
+
'nce \n Science \n Education \n Environment \n Investigations \n Global Health Security \n \n Sport\n \n \n ',
|
78 |
+
" \n \n Sport home \n Football \n Rugby Union \n Cricket \n F1 \n Golf \n Tennis \n Women's Sp",
|
79 |
+
'ort \n Racing \n Cycling \n Boxing \n More... \n \n Money\n \n \n \n \n Money home \n ',
|
80 |
+
' Property \n Tax \n Pensions \n Banking \n Investing \n Net Zero \n Calculators \n Guides \n \n ',
|
81 |
+
'Travel\n \n \n \n \n Travel home \n Europe \n UK \n Worldwide \n City breaks \n Hotels \n ',
|
82 |
+
' Cruise \n Ski \n Advice \n \n Business\n \n \n \n \n Business home \n Alex \n Ec',
|
83 |
+
'onomy \n Companies \n Markets \n Tech \n \n Health\n \n \n \n \n Health home \n Diet \n ',
|
84 |
+
' Fitness \n Conditions \n Wellbeing \n Parenting \n Guides \n Tools \n \n Opinion\n \n \n ',
|
85 |
+
' \n \n Opinion home \n Obituaries \n Letters to the Editor \n Telegraph View \n Our columnists \n Cartoons \n \n ',
|
86 |
+
' General election \n Ukraine\n \n \n \n \n Ukraine home \n Daily podcast \n Daily newsletter \n \n ',
|
87 |
+
' Royals\n \n \n \n \n Royals home \n King Charles III \n Queen Camilla \n Prince William \n Prince',
|
88 |
+
'ss of Wales \n Prince Harry \n Duchess of Sussex \n \n Life & Style\n \n \n \n \n Life & Style home \n ',
|
89 |
+
' Family \n Columnists \n Cookbook \n Food & Drink \n Fashion \n Beauty \n Luxury \n Cars \n Gardening \n ',
|
90 |
+
' Interiors \n Puzzle News \n Recommended \n Tel Mag \n \n Culture\n \n \n \n \n Culture hom',
|
91 |
+
'e \n TV \n Film \n Music \n Books \n Theatre \n Comedy \n Dance \n Opera \n Art \n \n ',
|
92 |
+
' Telegraph Tickets \n \n Puzzles \n \n \n \n \n \n \n \n UK Edition \n \n \n \n ',
|
93 |
+
' \n US Edition \n \n \n \n \n \n \n \n \n Subscribe now Free for one month \n \n \n ',
|
94 |
+
' \n \n \n \n \n \n \n \n Log in\n \n Login icon\n \n \n \n \n ',
|
95 |
+
' \n Follow us on:\n \n \n \n Facebook icon\n \n \n \n Instagram icon\n \n \n ',
|
96 |
+
' \n X icon\n \n \n \n Snapchat icon\n \n \n \n LinkedIn icon\n \n \n \n ',
|
97 |
+
' YouTube icon \n \n \n \n \n \n \n More from The Telegraph\n \n \n Download our app \n Newsletters \n ',
|
98 |
+
' Telegraph Extra \n Recommended \n Financial Solutions \n Events \n Betting \n Dating \n Offers \n Travel offers \n Shop \n ',
|
99 |
+
' Garden shop \n Bookshop \n Tickets \n Puzzles \n Fantasy Football \n Work at The Telegraph \n Telegraph Corporate \n Help and suppo',
|
100 |
+
'rt \n The Chelsea Magazine Company \n Broadband and Mobile Deals \n Voucher codes \n \n See top shops\n \n \n \n ',
|
101 |
+
' \n Samsung \n Nike \n ASOS \n eBay \n Currys \n Wayfair \n TUI \n JD Sports \n Travelodg',
|
102 |
+
'e \n Adidas \n Broadband deals \n Cheap broadband \n Broadband in my area \n Broadband and TV deals \n Mobile deals \n ',
|
103 |
+
" SIM-only deals \n \n \n \n \n \n \n \n \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__buttons .e-site-header-button__link').forE",
|
104 |
+
'ach(link => {\n\t\t\t\tlink.addEventListener(\'click\', (e) => {\n\t\t\t\t\teVar94 = "header-search-icon-desktop";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmg',
|
105 |
+
'ComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n\t\n\t\t\n\t\t\t\n\t\t Jump to navigation\n \n \n \n',
|
106 |
+
" \n \n \n \n \n \n Hitch Hiker's Guide author Douglas Adams dies aged 49\n \n \n \n \n By Andrew Alderson and Daniel Foggo 13 May 2001 • 12:00am \n \n \n ",
|
107 |
+
"\n \n \n \n DOUGLAS ADAMS, the thought-provoking author who inspired a generation with his cult science-fiction novel, The Hitch Hiker's Guide to the Galaxy, has died at the age of 4",
|
108 |
+
'9 from a heart attack while working out at the gym.\n \n \n \n \n \n \n \n \n \n \n Douglas Adams: inspired a generation with t',
|
109 |
+
'he cult novel, A Hitch Hiker\'s Guide to the Galaxy\n \n \n \n \n \n \n \n Adams\'s age was seven more than his cryptic answer of "42" to the intriguing ques',
|
110 |
+
'tion the comic novel had posed: what is the answer to life, the universe and everything? His book has sold more than 14 million copies worldwide, but Adams became a household name in Britain after it ',
|
111 |
+
'was turned into a BBC television series in the early 1980s.\n \n \n \n \n Adams, 6ft 5in tall and well built, did not have a history of heart problems. However, say friends, he',
|
112 |
+
' had visited the doctor just days ago complaining of a numbness in his arm. He collapsed on Friday while exercising at a gym in Santa Barbara on the west coast of America and never regained consciousn',
|
113 |
+
'ess. He leaves a widow and a six-year-old daughter.\n \n \n \n \n Adams was British but moved with his family to California in 1999, to be involved in a Disney film version of ',
|
114 |
+
'his book: he had previously lived in Islington, north London, for 22 years. A complex man, he was transported from obscurity to fame in 1979 by the instant success of his novel, which became hugely po',
|
115 |
+
'pular with students.\n \n \n \n \n Soon after the book was published, he was invited to sign copies at a small Soho bookshop. On his way there, Adams became convinced he was be',
|
116 |
+
'en caught up in a demonstration, only to discover the crowds were waiting for him.\n \n \n \n \n The book shot to the number one spot in the best-seller list the next day. He s',
|
117 |
+
'aid: "It was like being helicoptered to the top of Mount Everest, or having an orgasm without the foreplay." Adams, however, later suffered from writer\'s block and was so notoriously bad at meeting de',
|
118 |
+
"adlines that Sue Freestone, his former publisher, was even known to move into his house to bully him into writing.\n \n \n \n \n Ed Victor, Adams's literary agent for 20 years ",
|
119 |
+
'and a close friend, was devastated by the news yesterday. He said: "I feel as if someone has torn a limb off me. Tragic is an overused word, but this really is a tragic loss.\n \n \n \n ',
|
120 |
+
' \n Mr Victor said: "He was one of the truly original writers and thinkers of our generation who should have had many years ahead of him. He was not only entertaining, but also stimulating an',
|
121 |
+
'd provoking: he was a unique thinker with a huge audience."\n \n \n \n \n Mr Victor said that writer\'s block had been a terrible problem for Adams, who hated spending time alon',
|
122 |
+
'e. He said: "He was once locked in a hotel suite at the Berkeley for two weeks by Sonny Mehta [his former publisher]. When I asked Douglas how it had worked, he said: \'It was simple. I sat at the desk',
|
123 |
+
' and typed and Sonny sat in an armchair and glowered.\' "\n \n \n \n \n Adams was said to have used The Hitch Hiker\'s Guide, which started off as a radio show in the 1970s, to p',
|
124 |
+
'oke fun at those who seek solutions to unanswerable questions. It was intended to highlight the absurdity of attempting to do so.\n \n \n \n \n The novel has since been turned ',
|
125 |
+
'into a play and a computer game, and has spawned four sequels. Adams also set up a website called h2g2, an entertainment guide now run by the BBC, as a spin-off from his book.\n \n \n \n ',
|
126 |
+
' \n In his novel, which deals with the voyages of a suburban earthling, Arthur Dent, Adams describes a race of hyper-intelligent beings, who had reached a point where they were determined to',
|
127 |
+
' understand the purpose of the universe and their own existence.\n \n \n \n \n They built a supercomputer, Deep Thought, and asked it for the answer to the ultimate question of',
|
128 |
+
' life, the universe and everything. The computer worked for several millennia on the answer. Finally, the beings were shocked and disappointed with the computer\'s ridiculous response: "42".\n \n ',
|
129 |
+
' \n \n \n In the book, the Earth is referred to as "mostly harmless", which became a buzz phrase of the 1980s. Adams was born in Cambridge in 1952 and educated at Brentwood School, E',
|
130 |
+
"ssex, before returning to Cambridge to study at St John's College.\n \n \n \n \n His early career included work as a radio and television writer and producer. Some of his early",
|
131 |
+
" writing was with his friend Graham Chapman, a member of the Monty Python's Flying Circus comedy team.\n \n \n \n \n He later collaborated with Terry Jones, another Python team",
|
132 |
+
' member. Jones was in tears after learning of his friend\'s death yesterday. He told the Telegraph: "Douglas was a total original: he had a beautiful way of thinking and an incisive mind that went stra',
|
133 |
+
'ight to the heart of matters. He had a genius for putting those concepts into words. His books were great works of literature. He was a lovely man, and I loved him."\n \n \n \n \n ',
|
134 |
+
' Senior staff at the BBC, who worked with Adams, were equally sad. Alan Yentob, the corporation\'s director of drama and entertainment, said: "Douglas was a big character who will be hugely missed b',
|
135 |
+
'y a host of friends and millions of fans around the world."\n \n \n \n \n Geoffrey Perkins, the BBC\'s head of comedy and who produced the original radio series of the novel, sa',
|
136 |
+
'id: "I\'ve known Douglas for 25 years. He was absolutely one of the most creative geniuses to ever work in radio comedy."\n \n \n \n \n Adams\'s life was transformed by the publi',
|
137 |
+
"cation of The Hitch Hiker's Guide providing him with a wealth he had never imagined. He married Jane Belson, a barrister, in 1991 and they had a daughter, Polly, in 1994.\n \n \n \n ",
|
138 |
+
"\n Adams's other bestselling titles include The Restaurant at the End of the Universe; Life, the Universe and Everything and So Long, and Thanks for All the Fish. He was in discussion to turn an",
|
139 |
+
"other of his books, Dirk Gently's Holistic Detective Agency, into a film and was working on another novel, which was 12 years late.\n \n \n \n \n \n \n \n \n \n ",
|
140 |
+
' \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n WhatsApp Icon\n \n \n \n Email Icon\n ',
|
141 |
+
' \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n Advertisement\n \n \n \n\n\tMore stories\n\n\n',
|
142 |
+
'\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n Whats',
|
143 |
+
'App Icon\n \n \n \n Email Icon\n \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n \n\n\tMore from The T',
|
144 |
+
'elegraph\n\n\n\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\t\tMore stories\n\t\t\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t',
|
145 |
+
'\n\n\t\t\n\n\t\n\n\n\n\t\t\n\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n',
|
146 |
+
'\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tProtesters charged after blocking coach bound ',
|
147 |
+
'for Bibby Stockholm \n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n',
|
148 |
+
'\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tTelegraph Reporters\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:53am\n\t\t\t\t\t\t\t\n',
|
149 |
+
'\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t',
|
150 |
+
'\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t',
|
151 |
+
'\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t',
|
152 |
+
'\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tCanada police lay charges over murder of Sikh leader and probe Indian ties\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t',
|
153 |
+
'\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tOur F',
|
154 |
+
'oreign Staff\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:12am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t',
|
155 |
+
'\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t',
|
156 |
+
'\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n',
|
157 |
+
'\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n',
|
158 |
+
'\t\t\t\t\n\n\t\t\t\tKing takes on hundreds of new patronages\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t',
|
159 |
+
'\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tVictoria Ward\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 M',
|
160 |
+
'ay 2024, 12:01am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n',
|
161 |
+
'\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t',
|
162 |
+
'\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t',
|
163 |
+
'\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLabour’s strategy ‘won’t last’ into a general election, says Cabinet minister\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n',
|
164 |
+
'\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t',
|
165 |
+
'\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tJack Maidment\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 11:01pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t',
|
166 |
+
'\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n',
|
167 |
+
'\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t',
|
168 |
+
'\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t',
|
169 |
+
'\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLuton waste chance to start great escape in draw with Everton\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t',
|
170 |
+
'\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tWill Conroy\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n',
|
171 |
+
'\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:53pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
|
172 |
+
'\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
|
173 |
+
'\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n',
|
174 |
+
'\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tSeven things you may have missed in the local elections\n\n\t\t\t',
|
175 |
+
'\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t',
|
176 |
+
'\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tDominic Penna\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:37pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
|
177 |
+
'\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n',
|
178 |
+
'\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
|
179 |
+
'\n\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n The Telegraph\n \n \n \n Back to top\n \n \n \n \n \n Follow us on:\n \n \n ',
|
180 |
+
' \n Facebook icon\n \n \n \n Instagram icon\n \n \n \n X icon\n \n \n \n Snapchat icon\n \n \n',
|
181 |
+
' \n LinkedIn icon\n \n \n \n YouTube icon \n \n \n \n \n \n \n \n \n \n Help Centre\n About us\n Telegraph Extra\n ',
|
182 |
+
' Reader Prints\n Branded Content\n Syndication and Commissioning\n Fantasy Sport\n UK Voucher Codes\n Betting Offers\n Tax Strategy\n Broadband and Mobile Deals\n',
|
183 |
+
' The Chelsea Magazine Company\n Newsletters\n Download the Telegraph App\n Privacy\n Terms & Conditions\n Modern Slavery\n Advertising terms\n Guidelines\n ',
|
184 |
+
" \n \n © Telegraph Media Group Limited 2024\n \n \n \n \n \n \n\twindow.addEventListener( 'DOMContentLoaded', function() {\n\t\t_satellite.pageBottom();\n\t});\n\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\n\t \n\t\t\n\t\t\t\n\t",
|
185 |
+
"\t\t\t\n\t\t\t\n\t\n window.RUM_BASE = '/';\nimport { sampleRUM } from '/.rum/@adobe/helix-rum-js@^1/src/index.js';\nsampleRUM('lazy');\nsampleRUM('cwv');\n\n "]"""
|
186 |
+
instruct = "Find relevant sentences from text_dump with given the target sentence"
|
187 |
+
question = f"target sentence:'Adam douglas was born in Cambrige', text_dump:{sentences}"
|
188 |
+
answer = llmQuestion(tokenizer, model, instruct, question, 8192)
|
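A minimal usage sketch of llm_load.py (not part of the uploaded files), assuming the module is importable, a CUDA GPU is available, and a Hugging Face token sits in API_key.txt next to the script:

from llm_load import llmLoad, llmQuestion

# Load the 4-bit unsloth/llama-3-8b-Instruct model with an 8192-token context window.
tokenizer, model = llmLoad(8192)

# Ask the model to pick out supporting sentences for a claim from a text dump.
instruct = "Find relevant sentences from text_dump given the target sentence"
question = "target sentence:'Douglas Adams was born in Cambridge', text_dump:[...]"  # text_dump elided here
answer = llmQuestion(tokenizer, model, instruct, question, 1024)  # output_size caps max_new_tokens
print(answer)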
requirements.txt
ADDED
@@ -0,0 +1,118 @@
1 |
+
absl-py==2.1.0
|
2 |
+
aiohttp==3.9.5
|
3 |
+
aiosignal==1.3.1
|
4 |
+
annotated-types==0.6.0
|
5 |
+
appnope==0.1.4
|
6 |
+
asttokens==2.4.1
|
7 |
+
async-timeout==4.0.3
|
8 |
+
attrs==23.2.0
|
9 |
+
beautifulsoup4==4.12.3
|
10 |
+
blis==0.7.11
|
11 |
+
boto3==1.34.95
|
12 |
+
botocore==1.34.95
|
13 |
+
bs4==0.0.2
|
14 |
+
callbacks==0.3.0
|
15 |
+
catalogue==2.0.10
|
16 |
+
certifi==2024.2.2
|
17 |
+
charset-normalizer==3.3.2
|
18 |
+
click==8.1.7
|
19 |
+
cloudpathlib==0.16.0
|
20 |
+
colorama==0.4.6
|
21 |
+
comm==0.2.2
|
22 |
+
confection==0.1.4
|
23 |
+
cymem==2.0.8
|
24 |
+
debugpy==1.8.1
|
25 |
+
decorator==5.1.1
|
26 |
+
exceptiongroup==1.2.1
|
27 |
+
executing==2.0.1
|
28 |
+
fasttext==0.9.2
|
29 |
+
filelock==3.14.0
|
30 |
+
frozenlist==1.4.1
|
31 |
+
fsspec==2024.3.1
|
32 |
+
html5lib==1.1
|
33 |
+
huggingface-hub==0.22.2
|
34 |
+
idna==3.7
|
35 |
+
importlib_metadata==7.1.0
|
36 |
+
ipykernel==6.29.4
|
37 |
+
ipython==8.18.1
|
38 |
+
jedi==0.19.1
|
39 |
+
Jinja2==3.1.3
|
40 |
+
jmespath==1.0.1
|
41 |
+
joblib==1.4.0
|
42 |
+
jupyter_client==8.6.1
|
43 |
+
jupyter_core==5.7.2
|
44 |
+
langcodes==3.4.0
|
45 |
+
language_data==1.2.0
|
46 |
+
Levenshtein==0.25.1
|
47 |
+
lightning-utilities==0.11.2
|
48 |
+
lxml==5.2.1
|
49 |
+
marisa-trie==1.1.0
|
50 |
+
MarkupSafe==2.1.5
|
51 |
+
matplotlib-inline==0.1.7
|
52 |
+
mpmath==1.3.0
|
53 |
+
multidict==6.0.5
|
54 |
+
murmurhash==1.0.10
|
55 |
+
mypy-extensions==1.0.0
|
56 |
+
nest-asyncio==1.6.0
|
57 |
+
networkx==3.2.1
|
58 |
+
nltk==3.8.1
|
59 |
+
numpy==1.26.4
|
60 |
+
packaging==24.0
|
61 |
+
pandas==2.2.2
|
62 |
+
parso==0.8.4
|
63 |
+
pexpect==4.9.0
|
64 |
+
platformdirs==4.2.1
|
65 |
+
portalocker==2.8.2
|
66 |
+
preshed==3.0.9
|
67 |
+
prompt-toolkit==3.0.43
|
68 |
+
psutil==5.9.8
|
69 |
+
ptyprocess==0.7.0
|
70 |
+
pure-eval==0.2.2
|
71 |
+
pybind11==2.12.0
|
72 |
+
pydantic==2.7.1
|
73 |
+
pydantic_core==2.18.2
|
74 |
+
Pygments==2.17.2
|
75 |
+
pysbd==0.3.4
|
76 |
+
python-dateutil==2.9.0.post0
|
77 |
+
pytorch-lightning==2.2.3
|
78 |
+
pytz==2024.1
|
79 |
+
PyYAML==6.0.1
|
80 |
+
pyzmq==26.0.2
|
81 |
+
qwikidata==0.4.2
|
82 |
+
rapidfuzz==3.8.1
|
83 |
+
regex==2024.4.28
|
84 |
+
requests==2.31.0
|
85 |
+
rouge_score==0.1.2
|
86 |
+
s3transfer==0.10.1
|
87 |
+
sacrebleu==2.4.2
|
88 |
+
safetensors==0.4.3
|
89 |
+
sentence-splitter==1.4
|
90 |
+
six==1.16.0
|
91 |
+
smart-open==6.4.0
|
92 |
+
soupsieve==2.5
|
93 |
+
spacy==3.7.4
|
94 |
+
spacy-legacy==3.0.12
|
95 |
+
spacy-loggers==1.0.5
|
96 |
+
srsly==2.4.8
|
97 |
+
stack-data==0.6.3
|
98 |
+
sympy==1.12
|
99 |
+
tabulate==0.9.0
|
100 |
+
thinc==8.2.3
|
101 |
+
tokenizers==0.19.1
|
102 |
+
torch==2.2.2
|
103 |
+
torchmetrics==1.3.2
|
104 |
+
tornado==6.4
|
105 |
+
tqdm==4.66.2
|
106 |
+
traitlets==5.14.3
|
107 |
+
transformers==4.40.1
|
108 |
+
typer==0.9.4
|
109 |
+
typing_extensions==4.11.0
|
110 |
+
tzdata==2024.1
|
111 |
+
urllib3==1.26.18
|
112 |
+
wasabi==1.1.2
|
113 |
+
wcwidth==0.2.13
|
114 |
+
weasel==0.3.4
|
115 |
+
webencodings==0.5.1
|
116 |
+
yarl==1.9.4
|
117 |
+
zipp==3.18.1
|
118 |
+
sentence_transformers
|
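The pinned dependencies above need to be installed before running the scripts; a minimal sketch, assuming pip is available in the active environment and the command runs from the repository root:

# Install the pinned dependencies from requirements.txt with the current interpreter.
import subprocess, sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])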