# -*- coding: utf-8 -*-
"""WarOnlineForum.ipynb - scrape Quote -> Response message pairs from waronline.org (XenForo) forum threads."""
# Extracting messages from forum
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import urllib.request as urllib
import warnings
warnings.filterwarnings("ignore")
# Initialize the corpus of Quote -> Response texts
corpus = pd.DataFrame(columns=['Quote', 'Response'])
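# Each row will hold one (quoted text, reply text) pair scraped from a forum thread.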
def remove_substring(string, substring):
    index = string.find(substring)
    if index != -1:
        start_index = string.rfind(" ", 0, index) + 1
        end_index = string.find(" ", index)
        if end_index == -1:
            end_index = len(string)
        return string[:start_index] + string[end_index:]
    return string
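# Illustrative example (hypothetical input, not executed): the whole whitespace-delimited
# token containing the substring is dropped; callers collapse the doubled space afterwards:
#   remove_substring("читай https://site.com тут", ".com")  ->  "читай  тут"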
def remove_attachments(string, substring='Посмотреть вложение'):
    # Remove the XenForo attachment placeholder ("Посмотреть вложение <id>" = "View attachment <id>")
    index = string.find(substring)
    if index != -1:
        # Skip past the marker itself and the attachment id that follows it
        end_index = string.find(" ", index + len(substring) + 1)
        if end_index == -1:
            end_index = len(string)
        return string[:index] + string[end_index:]
    return string
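# Illustrative example (hypothetical input, not executed):
#   remove_attachments("фото Посмотреть вложение 12345 конец")  ->  "фото  конец"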
def collectDataFromPage(url):
    # url: the XenForo forum page to extract messages from
    # send a request to the URL and get the HTML response
    response = requests.get(url)
    # parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    # Find all message bodies (XenForo wraps post text in "bbWrapper" divs)
    message_contents = soup.find_all("div", class_="bbWrapper")
    # Loop through each message element
    for message_content in message_contents:
        # Full text of the message
        message_text = message_content.text.strip()
        # Quoted text within the message, if any
        try:
            quoted_text = message_content.find("blockquote").text.strip()
            quoted_text = ''.join(BeautifulSoup(quoted_text, "html.parser").findAll(string=True))
            # Drop the "Нажмите для раскрытия..." ("Click to expand...") widget text
            quoted_text = quoted_text.replace('Нажмите для раскрытия...', '')
            message_text = message_text.replace('Нажмите для раскрытия...', '')
            # Quote: strip the "<user> написал(а):" attribution header, URLs and attachments
            Quote = re.sub(r'http\S+', '', ' '.join(quoted_text.split()).partition('(а): ')[2])
            Quote = remove_substring(Quote, ".com")
            Quote = remove_attachments(Quote)
            Quote = ' '.join(remove_substring(Quote, "@").split())
            # Response: the message text with the quoted block removed, cleaned the same way
            Message = ' '.join(message_text.replace(quoted_text, '').split())
            Message = remove_substring(Message, ".com")
            Message = remove_attachments(Message)
            Message = ' '.join(remove_substring(Message, "@").split())
            if Message and Quote:
                # corpus is a dataframe (global)
                corpus.loc[len(corpus)] = [Quote, Message]
                #print("Quoted Text:", Quote)
                #print("Message Text:", Message)
                #print('________________________')
        except Exception:
            # Messages without a <blockquote> raise AttributeError above and are skipped
            pass
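# Illustrative usage (hypothetical thread URL, not executed): appends rows to the global `corpus` dataframe.
#   collectDataFromPage("https://waronline.org/fora/index.php?threads/example-thread.123/page-2")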
def compare_pages(url1, url2):
    page1 = requests.get(url1).text
    page2 = requests.get(url2).text
    # Crude heuristic (not used below): treat pages as identical if their HTML is the same length
    return len(page1) == len(page2)
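# compare_pages2 detects the end of a thread: the script relies on the forum redirecting an
# out-of-range /page-N URL to the last existing page, so when two consecutive page URLs
# resolve to the same final URL, we have run past the last page.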
def compare_pages2(url1, url2):
    return urllib.urlopen(url1).geturl() == urllib.urlopen(url2).geturl()
def pages_of_thread(thread, startingPage=1):
    page = startingPage
    lastPage = False
    while not lastPage:
        response = requests.get(thread + '/page-' + str(page))
        if response.status_code == 200:
            collectDataFromPage(url=thread + '/page-' + str(page))
            print(f'finished page #{page}')
            if not compare_pages2(thread + '/page-' + str(page), thread + '/page-' + str(page + 1)):
                page += 1
            else:
                lastPage = True
        else:
            lastPage = True
# Usage Example (thread_url is a placeholder for a thread URL):
#pages_of_thread(thread_url, 800)  # scrape thread_url starting from page 800
"""______________________________________ Main Code __________________________________________"""
# Define the URLs to be crawled
base_url = 'https://waronline.org'
# "Пехота" (Infantry) subforum
#url = "https://waronline.org/fora/index.php?forums/%D0%9F%D0%B5%D1%85%D0%BE%D1%82%D0%B0.3/"
# "Общевойсковые темы" (Combined-arms topics) subforum
#url = "https://waronline.org/fora/index.php?forums/%D0%9E%D0%B1%D1%89%D0%B5%D0%B2%D0%BE%D0%B9%D1%81%D0%BA%D0%BE%D0%B2%D1%8B%D0%B5-%D1%82%D0%B5%D0%BC%D1%8B.4/"
# "ВМФ - гражданский флот" (Navy and civil fleet) subforum
url = "https://waronline.org/fora/index.php?forums/%D0%92%D0%9C%D0%A4-%D0%B3%D1%80%D0%B0%D0%B6%D0%B4%D0%B0%D0%BD%D1%81%D0%BA%D0%B8%D0%B9-%D1%84%D0%BB%D0%BE%D1%82.12/"
base_page = 1 #Starting with page-1
lastSubForumPage = False
while not lastSubForumPage:
    # Send a GET request to the current subforum page
    response = requests.get(url + 'page-' + str(base_page))
    forum_threads = []  # threads found on this page of the subforum
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, "html.parser")
        # Collect all thread links on the page (thread URLs contain 'threads')
        links = soup.find_all("a")
        for link in links:
            lnk = link.get("href")
            if lnk and 'threads' in lnk:
                forum_threads.append((base_url + lnk).rsplit("/", 1)[0])
        # Remove duplicate links
        forum_threads = list(set(forum_threads))
        for trd in forum_threads:
            pages_of_thread(trd)  # Starting at page=1
            print(f'finished thread: {trd}')
        if not compare_pages2(url + 'page-' + str(base_page), url + 'page-' + str(base_page + 1)):
            print(f'finished subforum page #{base_page}')
            base_page += 1
        else:
            lastSubForumPage = True
    else:
        print("Failed to load the page")
        lastSubForumPage = True
# Lowercase all
corpus['Quote'] = corpus['Quote'].apply(lambda x: x.lower() if isinstance(x,str) else x)
corpus['Response'] = corpus['Response'].apply(lambda x: x.lower() if isinstance(x,str) else x)
# Remove non-alphanumeric characters (keep letters, including Cyrillic, digits and whitespace)
corpus['Quote'] = corpus['Quote'].str.replace(r'[^\w\s]', '', regex=True)
corpus['Response'] = corpus['Response'].str.replace(r'[^\w\s]', '', regex=True)
#Export to csv
pathToDrive = ''
filename = 'part5.csv'
corpus.to_csv(pathToDrive+filename,index=False)
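# Quick sanity check of the export (optional, illustrative):
#   pd.read_csv(pathToDrive + filename).head()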