"""WarOnlineForum.ipynb

Scrapes quote/reply message pairs from threads on the waronline.org forum
(built on XenForo) and saves them as a CSV corpus.
"""
import re
import warnings
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Suppress library warnings (e.g. parser/pandas deprecation noise).
warnings.filterwarnings("ignore")

# Accumulates (quoted message, reply) pairs scraped from the forum.
corpus = pd.DataFrame(columns=['Quote', 'Response'])

def remove_substring(string, substring):
    """Remove the whole whitespace-delimited token that contains `substring`."""
    index = string.find(substring)
    if index != -1:
        start_index = string.rfind(" ", 0, index) + 1
        end_index = string.find(" ", index)
        if end_index == -1:
            end_index = len(string)
        return string[:start_index] + string[end_index:]
    return string
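
# Illustrative example (made-up input): remove_substring("see example.com now", ".com")
# returns "see  now": the whole token around ".com" is dropped; callers
# re-join whitespace afterwards.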


def remove_attachments(string, substring='Посмотреть вложение'):
    """Remove a "Посмотреть вложение <id>" ("View attachment <id>") marker."""
    index = string.find(substring)
    if index != -1:
        # Search past the marker itself (it contains a space) so the numeric
        # attachment id that follows it is removed as well.
        end_index = string.find(" ", index + len(substring) + 1)
        if end_index == -1:
            end_index = len(string)
        return string[:index] + string[end_index:]
    return string
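
# Illustrative example (made-up id): remove_attachments("фото Посмотреть вложение 123456 тут")
# returns "фото  тут": both the marker and the attachment id are removed.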


def collectDataFromPage(url):
    """Extract (quote, reply) pairs from all posts on one thread page."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # XenForo wraps each post body in a div with class "bbWrapper".
    message_contents = soup.find_all("div", class_="bbWrapper")

    for message_content in message_contents:
        message_text = message_content.text.strip()

        try:
            # Posts without a <blockquote> (i.e. that quote nothing) raise
            # AttributeError here and are skipped.
            quoted_text = message_content.find("blockquote").text.strip()

            # Strip XenForo's "Click to expand..." widget text.
            quoted_text = quoted_text.replace('Нажмите для раскрытия...', '')
            message_text = message_text.replace('Нажмите для раскрытия...', '')

            # The quote header ends with 'сказал(а): ' ("said: "); keep only
            # the text after it, then drop URLs, bare ".com" tokens,
            # attachment markers, and @-mentions.
            Quote = re.sub(r'http\S+', '', ' '.join(quoted_text.split()).partition('(а): ')[2])
            Quote = remove_substring(Quote, ".com")
            Quote = remove_attachments(Quote)
            Quote = ' '.join(remove_substring(Quote, "@").split())

            # The reply is the post body minus the quoted block, cleaned the
            # same way.
            Message = ' '.join(message_text.replace(quoted_text, '').split())
            Message = remove_substring(Message, ".com")
            Message = remove_attachments(Message)
            Message = ' '.join(remove_substring(Message, "@").split())

            if Message and Quote:
                corpus.loc[len(corpus)] = [Quote, Message]

        except AttributeError:
            pass
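
# For reference, the parser above assumes post markup roughly like this
# (simplified XenForo structure; names and text are illustrative only):
#   <div class="bbWrapper">
#     <blockquote>Username сказал(а): quoted text... Нажмите для раскрытия...</blockquote>
#     reply text
#   </div>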


def compare_pages(url1, url2):
    """Rough page-identity check via raw response lengths (not used below)."""
    page1 = requests.get(url1).text
    page2 = requests.get(url2).text
    return len(page1) == len(page2)


def compare_pages2(url1, url2):
    """Return True when both URLs resolve (after redirects) to the same final URL."""
    return urlopen(url1).geturl() == urlopen(url2).geturl()
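
# pages_of_thread below relies on XenForo redirecting an out-of-range page
# number (e.g. '/page-999') back to the last existing page: when '/page-N'
# and '/page-N+1' resolve to the same final URL, N is the last page.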


def pages_of_thread(thread, startingPage=1):
    """Scrape every page of one thread, starting from `startingPage`."""
    page = startingPage
    lastPage = False
    while not lastPage:
        response = requests.get(thread + '/page-' + str(page))
        if response.status_code == 200:
            collectDataFromPage(url=thread + '/page-' + str(page))
            print(f'finished page #{page}')
            if not compare_pages2(thread + '/page-' + str(page), thread + '/page-' + str(page + 1)):
                page += 1
            else:
                lastPage = True
        else:
            lastPage = True
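
# Illustrative call (hypothetical thread URL):
#   pages_of_thread('https://waronline.org/fora/index.php?threads/some-thread.123')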


"""______________________________________ Main Code __________________________________________"""


base_url = 'https://waronline.org'

# Subforum to scrape; the percent-encoded title decodes to
# "ВМФ-гражданский-флот" ("Navy - civilian fleet").
url = "https://waronline.org/fora/index.php?forums/%D0%92%D0%9C%D0%A4-%D0%B3%D1%80%D0%B0%D0%B6%D0%B4%D0%B0%D0%BD%D1%81%D0%BA%D0%B8%D0%B9-%D1%84%D0%BB%D0%BE%D1%82.12/"

base_page = 1
lastSubForumPage = False

while not lastSubForumPage:

    response = requests.get(url + 'page-' + str(base_page))
    forum_threads = []

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        # Collect every thread link on this subforum page, trimming the last
        # URL segment (e.g. a trailing slash or "/post-123") to get the
        # thread's root URL.
        links = soup.find_all("a")
        for link in links:
            lnk = link.get("href")
            if lnk and 'threads' in lnk:
                forum_threads.append((base_url + lnk).rsplit("/", 1)[0])

        # The same thread is linked several times per page; deduplicate.
        forum_threads = list(set(forum_threads))

        for trd in forum_threads:
            pages_of_thread(trd)
            print(f'finished thread: {trd}')

        if not compare_pages2(url + 'page-' + str(base_page), url + 'page-' + str(base_page + 1)):
            print(f'finished subforum page #{base_page}')
            base_page += 1
        else:
            lastSubForumPage = True

    else:
        print("Failed to load the page")
        lastSubForumPage = True

# Normalise both columns to lower case.
corpus['Quote'] = corpus['Quote'].apply(lambda x: x.lower() if isinstance(x, str) else x)
corpus['Response'] = corpus['Response'].apply(lambda x: x.lower() if isinstance(x, str) else x)

# NOTE: the pattern '[^a-zA-Z]' would strip spaces and all Cyrillic
# characters, destroying a Russian-language corpus, so these lines stay
# disabled. (Also, `Series.str.replace` returns a new Series; the result
# must be assigned back for the call to have any effect.)
# corpus['Quote'] = corpus['Quote'].str.replace('[^a-zA-Z]', '', regex=True)
# corpus['Response'] = corpus['Response'].str.replace('[^a-zA-Z]', '', regex=True)

# Write the corpus to disk; `pathToDrive` can point to a mounted Google Drive
# folder when the notebook runs in Colab.
pathToDrive = ''
filename = 'part5.csv'
corpus.to_csv(pathToDrive + filename, index=False)
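
# Illustrative sanity check when reloading the corpus later (assumes the file
# written above):
#   df = pd.read_csv(pathToDrive + filename)
#   print(df.shape)
#   print(df.head())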