-
Notifications
You must be signed in to change notification settings - Fork 0
/
LaTex_Scrawler.py
67 lines (53 loc) · 2 KB
/
LaTex_Scrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import json
import sys

import requests
from bs4 import BeautifulSoup
# Front-page tabs of TeX Stack Exchange to scrape. The value after "tab="
# is stored with every record as its 'Category'.
url = ["https://tex.stackexchange.com/?tab=active",
       "https://tex.stackexchange.com/?tab=bounties",
       "https://tex.stackexchange.com/?tab=hot",
       "https://tex.stackexchange.com/?tab=week",
       "https://tex.stackexchange.com/?tab=month"]

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
_REQUEST_TIMEOUT = 30


def _text_at(node, *class_path):
    """Return the text of the element reached by following *class_path*.

    Starting at *node*, descend one level per entry in *class_path*, each
    time taking the first descendant that carries that CSS class.

    Returns the element's ``get_text()`` result unchanged (no stripping),
    or ``None`` when *node* is ``None`` or any step finds no element.
    """
    for css_class in class_path:
        if node is None:
            return None
        node = node.find(class_=css_class)
    return None if node is None else node.get_text()


def _author_name(started_text, reputation):
    """Extract the author's display name from a 'started' element's text.

    Assumes (as the selectors used below do) that the second-to-last line
    of *started_text* reads "<display name> <reputation>". The trailing
    *reputation* text is removed and the remainder is returned, so display
    names containing spaces are kept whole.

    Returns ``None`` when the text is missing, has fewer than two lines,
    or nothing is left once the reputation has been removed.
    """
    if not started_text:
        return None
    lines = started_text.split('\n')
    if len(lines) < 2:
        return None
    user_line = lines[-2].strip()
    reputation = reputation.strip() if reputation else ''
    if reputation and user_line.endswith(reputation):
        user_line = user_line[:-len(reputation)].strip()
    return user_line or None


def _parse_question(summary, category):
    """Build one output record from a "question-summary" element.

    *summary* is the element for a single question and *category* is the
    tab name the question was listed under. Any field whose element is
    not present in *summary* is stored as ``None`` (JSON ``null``) so that
    one incomplete entry does not abort the whole scrape.
    """
    started = summary.find(class_="started")
    reputation = _text_at(started, "reputation-score")
    return {
        'Question': _text_at(summary, "question-hyperlink"),
        # The first "mini-counts" element in the summary is used as the
        # vote count.
        'Votes': _text_at(summary, "mini-counts"),
        'number of answers': _text_at(summary, "status", "mini-counts"),
        'Views': _text_at(summary, "views", "mini-counts"),
        # Only the first tag of the question is recorded.
        'Tags': _text_at(summary, "tags", "post-tag"),
        'Category': category,
        "Time": _text_at(started, "relativetime"),
        "Author Name": _author_name(_text_at(started), reputation),
        "Reputation Score": reputation,
    }


Final_Result = []
for tab_url in url:
    page = requests.get(tab_url, timeout=_REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'lxml')
    Category = tab_url.rsplit('=', 1)[-1]
    Question_list = soup.find(id="question-mini-list")
    if Question_list is None:
        # No question container on this page: report it and carry on with
        # the remaining tabs.
        sys.stderr.write(
            "warning: no 'question-mini-list' element found at {}\n".format(
                tab_url))
        continue
    for qt in Question_list.find_all(class_="question-summary"):
        Final_Result.append(_parse_question(qt, Category))

# Write all collected records to a JSON file.
with open('LaTex.json', 'w') as outfile:
    json.dump(Final_Result, outfile, indent=4)