# -*- coding: utf-8 -*-
"""WarOnlineForum.ipynb"""

# Extract Quote -> Response message pairs from the waronline.org forum

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import urllib.request as urllib
import warnings
warnings.filterwarnings("ignore")

# Initialize the corpus of Quote -> Response pairs
corpus = pd.DataFrame(columns=['Quote', 'Response'])

def remove_substring(string, substring):
    # Drop the whole whitespace-delimited token that contains `substring`
    # (used below to scrub leftover URLs and @-mentions from post text)
    index = string.find(substring)
    if index != -1:
        start_index = string.rfind(" ", 0, index) + 1
        end_index = string.find(" ", index)
        if end_index == -1:
            end_index = len(string)
        return string[:start_index] + string[end_index:]
    return string
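# Example (worked by hand): the whole token containing the substring is dropped:
#   remove_substring("see https://example.com for details", ".com")
#   -> "see  for details"   (the extra space is collapsed later by ' '.join(...split()))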

def remove_attachments(string, substring='Посмотреть вложение'):
  # Strip the XenForo attachment placeholder "Посмотреть вложение <id>"
  # ("View attachment <id>") together with the numeric attachment id.
  index = string.find(substring)
  if index != -1:
    # Scan for the space *after* the attachment id, not the space inside
    # the two-word marker itself, so the id is removed along with it
    end_index = string.find(" ", index + len(substring) + 1)
    if end_index == -1:
      end_index = len(string)
    return string[:index] + string[end_index:]
  return string
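# Example:
#   remove_attachments("смотрите Посмотреть вложение 123456 выше")
#   -> "смотрите  выше"   ("see [View attachment 123456] above" -> "see  above")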

def collectDataFromPage(url):
  # Fetch one thread page and harvest Quote -> Response pairs from it

  # Send a request to the URL and get the HTML response
  response = requests.get(url)

  # Parse the HTML using BeautifulSoup
  soup = BeautifulSoup(response.content, "html.parser")

  # Find all message bodies (XenForo wraps post content in div.bbWrapper)
  message_contents = soup.find_all("div", class_="bbWrapper")

  # Loop through each message element
  for message_content in message_contents:
    # Full post text, including any quoted block
    message_text = message_content.text.strip()

    # Posts without a <blockquote> have no quote to pair with: find()
    # returns None and the AttributeError below skips them
    try:
      quoted_text = message_content.find("blockquote").text.strip()
      # Drop the XenForo expand-link caption "Нажмите для раскрытия..."
      # ("Click to expand...") from both the quote and the full post
      quoted_text = quoted_text.replace('Нажмите для раскрытия...', '')
      message_text = message_text.replace('Нажмите для раскрытия...', '')

      # The quoted text starts after the attribution "<user> сказал(а): "
      # ("<user> said:"); strip URLs, attachment markers and @-mentions
      Quote = re.sub(r'http\S+', '', ' '.join(quoted_text.split()).partition('(а): ')[2])
      Quote = remove_substring(Quote, ".com")
      Quote = remove_attachments(Quote)
      Quote = ' '.join(remove_substring(Quote, "@").split())

      # The response is the post text with the quoted block removed
      Message = ' '.join(message_text.replace(quoted_text, '').split())
      Message = remove_substring(Message, ".com")
      Message = remove_attachments(Message)
      Message = ' '.join(remove_substring(Message, "@").split())

      if Message and Quote:
        # corpus is a global dataframe; append this pair as a new row
        corpus.loc[len(corpus)] = [Quote, Message]
    except AttributeError:
      pass
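# Usage sketch (illustrative thread URL, not a real one):
#   collectDataFromPage('https://waronline.org/fora/index.php?threads/example.123/page-1')
# appends any Quote/Response pairs found on that page to the global `corpus`.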

def compare_pages(url1, url2):
    # Crude duplicate-page check (unused): treats identical content length
    # as "same page"; compare_pages2 below is the more reliable variant
    page1 = requests.get(url1).text
    page2 = requests.get(url2).text
    return len(page1) == len(page2)

def compare_pages2(url1, url2):
  # XenForo redirects an out-of-range page number to the last existing page,
  # so if both URLs resolve to the same final URL, url1 is the last page
  return urllib.urlopen(url1).geturl() == urllib.urlopen(url2).geturl()
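# Example: for a 12-page thread, '/page-12' and '/page-13' both resolve to
# the page-12 URL, so compare_pages2(thread+'/page-12', thread+'/page-13')
# returns True and the crawl below stops.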


def pages_of_thread(thread, startingPage=1):
  # Walk a thread page by page; the redirect check (compare_pages2)
  # signals when the current page is the last one
  page = startingPage
  lastPage = False
  while not lastPage:
    response = requests.get(thread+'/page-'+str(page))
    if response.status_code == 200:
      collectDataFromPage(url=thread+'/page-'+str(page))
      print(f'finished page #{page}')
      if not compare_pages2(thread+'/page-'+str(page), thread+'/page-'+str(page+1)):
        page += 1
      else:
        lastPage = True
    else:
      lastPage = True

# Usage example:
# pages_of_thread(thread_url, 800)  # crawl thread_url starting at page 800

"""______________________________________ Main Code __________________________________________"""

# Define the URLs to be crawled
base_url = 'https://waronline.org'
# "Пехота" (Infantry) subforum
#url = "https://waronline.org/fora/index.php?forums/%D0%9F%D0%B5%D1%85%D0%BE%D1%82%D0%B0.3/"
# "Общевойсковые темы" (Combined-arms topics) subforum
#url = "https://waronline.org/fora/index.php?forums/%D0%9E%D0%B1%D1%89%D0%B5%D0%B2%D0%BE%D0%B9%D1%81%D0%BA%D0%BE%D0%B2%D1%8B%D0%B5-%D1%82%D0%B5%D0%BC%D1%8B.4/"
# "ВМФ / гражданский флот" (Navy / civilian fleet) subforum
url = "https://waronline.org/fora/index.php?forums/%D0%92%D0%9C%D0%A4-%D0%B3%D1%80%D0%B0%D0%B6%D0%B4%D0%B0%D0%BD%D1%81%D0%BA%D0%B8%D0%B9-%D1%84%D0%BB%D0%BE%D1%82.12/"

base_page = 1 #Starting with page-1
lastSubForumPage = False

while not lastSubForumPage:

  # Send a GET request to the URL
  response = requests.get(url+'page-'+str(base_page))
  forum_threads = [] #threads on this page of subforum

  # Check if the request was successful
  if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Keep links pointing at threads; drop the last path segment
    # (e.g. '' or 'unread') so that '/page-N' can be appended later
    for link in soup.find_all("a"):
      lnk = link.get("href")
      if lnk and 'threads' in lnk:
        forum_threads.append((base_url+lnk).rsplit("/", 1)[0])

    # Deduplicate the links
    forum_threads = list(set(forum_threads))

    for trd in forum_threads:
      pages_of_thread(trd) # Starting at page=1
      print(f'finished thread: {trd}')

    # Same redirect check as for threads: advance until '/page-(N+1)'
    # resolves to the same URL as '/page-N'
    if not compare_pages2(url+'page-'+str(base_page), url+'page-'+str(base_page+1)):
      print(f'finished subforum page #{base_page}')
      base_page += 1
    else:
      lastSubForumPage = True

  else:
    print("Failed to load the page")
    lastSubForumPage = True

# Lowercase all
corpus['Quote'] = corpus['Quote'].apply(lambda x: x.lower() if isinstance(x,str) else x)
corpus['Response'] = corpus['Response'].apply(lambda x: x.lower() if isinstance(x,str) else x)

# Remove everything except letters (Latin and Cyrillic), digits and spaces.
# Note: str.replace is not in-place, so the result must be assigned back,
# and the character class must keep Cyrillic since the corpus is Russian.
corpus['Quote'] = corpus['Quote'].str.replace(r'[^0-9a-zA-Zа-яёА-ЯЁ ]', '', regex=True)
corpus['Response'] = corpus['Response'].str.replace(r'[^0-9a-zA-Zа-яёА-ЯЁ ]', '', regex=True)

# Export the corpus to CSV
pathToDrive = ''
filename = 'part5.csv'
corpus.to_csv(pathToDrive+filename,index=False)
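
# Optional sanity check (not part of the original notebook): reload the
# exported CSV and eyeball a few Quote -> Response pairs.
#check = pd.read_csv(pathToDrive+filename)
#print(check.shape)
#print(check.head(3))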