import requests
import pandas as pd
from bs4 import BeautifulSoup
import pysbd
def extract_div_contents_from_url(url):
    """Scrape an AfD page and return a DataFrame for the entry whose heading text
    matches the URL fragment, using the closed-XfD boilerplate divs."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ["mw-heading mw-heading3", 'boilerplate afd vfd xfd-closed', 'boilerplate afd vfd xfd-closed archived mw-archivedtalk']
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    data = []
    for div in divs:
        try:
            title = None
            text_url = None
            # Extract title and text_url
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']
            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue
            deletion_discussion = div.prettify()
            # Extract label
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()
            # Extract confirmation
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()
            # Split deletion_discussion into discussion and verdict
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
            data.append([title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue
    df = pd.DataFrame(data, columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'verdict', 'discussion'])
    df = df[['title', 'discussion', 'verdict', 'label']]
    print(f"DataFrame created with {len(df)} rows")
    return df
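
# Minimal usage sketch (not part of the original code). Assumption: the AfD log URL
# carries an '#Article_Title' fragment, since the extractor above only keeps headings
# whose text matches that fragment (underscores read as spaces). The URL below is
# purely illustrative, and this helper is defined but never called automatically.
def _demo_extract_afd_section():
    sample_url = ('https://en.wikipedia.org/wiki/Wikipedia:Articles_for_deletion/'
                  'Log/2024_January_1#Example_article')
    return extract_div_contents_from_url(sample_url)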
def extract_div_contents_from_url_new(url):
    """Fallback extractor: collects the discussion HTML from the siblings between one
    'mw-heading mw-heading3' heading and the next."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ["mw-heading mw-heading3"]
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    log_date = url.split('/')[-1]  # currently unused; kept from the original code
    data = []
    for i, div in enumerate(divs):
        try:
            title = None
            text_url = None
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']
            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue
            # Collect everything between this heading and the next heading of the same class
            next_div = div.find_next('div', class_='mw-heading mw-heading3')
            deletion_discussion = ''
            sibling = div.find_next_sibling()
            while sibling and sibling != next_div:
                deletion_discussion += str(sibling)
                sibling = sibling.find_next_sibling()
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
            # Note: the value order below is kept exactly as in the original code; combined
            # with the column list used for the DataFrame, the extracted discussion HTML
            # lands in the 'verdict' column, which is what process_discussion() later reads.
            data.append([title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue
    df = pd.DataFrame(data, columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    return df
def extract_post_links_text(discussion_html):
    """Return the discussion HTML after the first '<span class="plainlinks">' marker,
    or the input unchanged if the marker is absent."""
    split_point = '<span class="plainlinks">'
    if split_point in discussion_html:
        parts = discussion_html.split(split_point)
        if len(parts) > 1:
            return parts[1]
    return discussion_html

def process_discussion(df):
    df['discussion_cleaned'] = df['verdict'].apply(extract_post_links_text)
    return df
def html_to_plaintext(html_content):
    """Convert discussion HTML to plain text, inserting line breaks around block-level
    tags before extracting the text."""
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup.find_all(['p', 'li', 'dd', 'dl']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    # Note: with strip=True, get_text() skips whitespace-only strings, so the inserted
    # '\n' markers may be dropped and the result can come back as a single line.
    text = soup.get_text(separator=' ', strip=True)
    text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
    return text

def process_html_to_plaintext(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(html_to_plaintext)
    df = df[['title', 'discussion_cleaned', 'label']]
    return df
def split_text_into_sentences(text):
    """Segment the text with pysbd and return everything after the first sentence."""
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    return ' '.join(sentences[1:])

def process_split_text_into_sentences(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
    return df
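
# Quick, self-contained check of the sentence-splitting step (hypothetical input, not
# from the original code): pysbd segments the string and sentences[1:] drops the first
# segment, so only the text after the opening sentence is returned.
def _demo_sentence_split():
    return split_text_into_sentences("The result was keep. Consensus to keep was clear.")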
def process_data(url):
    df = extract_div_contents_from_url(url)
    # Fall back to the heading-based extractor if nothing was found or the discussion
    # came back empty.
    if df.empty or df.at[0, 'discussion'] == '':
        df = extract_div_contents_from_url_new(url)
    #print(df.head())
    df = process_discussion(df)
    if not df.empty:
        print(df.at[0, 'discussion'])
    df = process_html_to_plaintext(df)
    df = process_split_text_into_sentences(df)
    if not df.empty:
        return df.at[0, 'title'] + ' : ' + df.at[0, 'discussion_cleaned']
    else:
        return 'Empty DataFrame'
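
# End-to-end usage sketch (an assumption, not in the original file): run the module
# directly with a hypothetical AfD log URL whose fragment names the article whose
# discussion should be extracted.
if __name__ == '__main__':
    demo_url = ('https://en.wikipedia.org/wiki/Wikipedia:Articles_for_deletion/'
                'Log/2024_January_1#Example_article')
    print(process_data(demo_url))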