d0r1h committed on
Commit
66f6b0b
1 Parent(s): d17a7e8

Create extractdata.py

Browse files

Scraping text from a given URL

Files changed (1) hide show
  1. extractdata.py +33 -0
extractdata.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+
5
+
6
# A span opened by "(" or "[" and lazily closed by the first ")" or "]",
# together with the whitespace that follows it, e.g.
#   वर्ल्ड कप 2019 (World Cup 2019) --> वर्ल्ड कप 2019
noise1 = re.compile(r"[([].*?[\)\]]\s+")

# A brace-delimited fragment on a single line, such as leftover inline ad
# script: { googletag.display(div-gpt-ad-1517823702248-0); }
# The match is lazy, so it ends at the first closing brace.
noise2 = re.compile(r"\{.*?\}")

# Any single ASCII letter (a-z, A-Z).
noise3 = re.compile(r"[a-zA-Z]")

# Any single character from: { ( ) # @ : % , _ ; & ! = } ]
noise4 = re.compile(r"[\{()#@:%,_;&!=}\]]")

# A question mark or a closing square bracket.
noise5 = re.compile(r"[\?\]]")
11
+
12
+
13
def _clean_article_text(text: str) -> str:
    """Remove page boilerplate, script residue and Latin letters from *text*.

    The steps run in a fixed order: newlines are flattened to spaces before
    the brace pattern is applied, because ``noise2`` cannot match across a
    line break.
    """
    # Literal labels that appear inside the article container:
    # "विस्तार" (roughly "details") and "विज्ञापन" ("advertisement").
    text = text.replace("विस्तार ", " ")
    text = text.replace("विज्ञापन", " ")
    text = text.replace("\n", " ")
    text = text.replace("\xa0", " ")  # non-breaking space -> ordinary space

    text = noise2.sub(" ", text)  # {...} fragments (inline script residue)
    text = noise3.sub(" ", text)  # ASCII letters
    text = noise4.sub(" ", text)  # stray punctuation

    # Collapse the runs of spaces left behind by the substitutions above.
    return re.sub(" +", " ", text)


def extract_text(url: str, timeout: float = 10) -> str:
    """Download *url* and return the cleaned text of its article body.

    The article body is the first element found with
    ``class_="article-desc ul_styling"``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The cleaned article text, or an empty string when the page contains
        no matching article element.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (connection failure, timeout, ...).
    """
    data = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(data.content, "html.parser")

    # ``find`` returns None when nothing matches; handle that explicitly
    # instead of letting a later attribute access fail on None.
    article = soup.find(class_="article-desc ul_styling")
    if article is None:
        print(f"Not able to fetch text: no article body found at {url}")
        return ""

    return _clean_article_text(article.text)