-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapper.py
39 lines (37 loc) · 1.13 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from bs4 import BeautifulSoup
import dryscrape
import re
import string
import freshdeskhack.rake as rake
import json
from operator import itemgetter, attrgetter, methodcaller
def get_jobs(
        url,
        stoplist_path="/root/freshack/Jobscraper/freshdeskhack/SmartStoplist.txt"):
    """Scrape job postings from *url* and return them as a JSON string.

    The page is rendered through a dryscrape session, parsed with
    BeautifulSoup, and every ``<div class="fd-posdesc">`` element is treated
    as one job posting.  For each posting:

    * ``title`` is the string of the posting's first ``<h3>`` (``None`` when
      the posting has no ``<h3>``).
    * The description is the concatenation of the ``.string`` of every
      ``<div>`` nested in the posting that has one.
    * The description is passed to the RAKE extractor.  Extracted phrases
      containing ``"year"`` are stored under ``experience`` (the last such
      phrase wins; the key is absent when there is none); all other phrases
      are collected under ``keywords``.

    Args:
        url: Address of the careers page to visit.
        stoplist_path: Path of the stop-word file handed to ``rake.Rake``.
            Defaults to the location the original deployment used.

    Returns:
        A JSON document of the form ``{"jobs": [{"title": ..., "keywords":
        [...], "experience": ...}, ...]}`` serialized with ``json.dumps``.
    """
    # NOTE(review): the numeric arguments look like RAKE's minimum characters
    # per word, maximum words per phrase and minimum keyword frequency --
    # confirm against freshdeskhack.rake.
    rake_object = rake.Rake(stoplist_path, 3, 2, 1)

    # NOTE(review): this starts a virtual X server on every call; presumably
    # only one is needed per process -- confirm before calling repeatedly.
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    session.visit(url)
    soup = BeautifulSoup(session.body(), 'lxml')

    jobs = []
    for tag in soup.find_all("div", class_="fd-posdesc"):
        job = {}

        # ``tag.h3`` is None when the posting has no heading; do not let one
        # malformed posting abort the whole scrape.
        heading = tag.h3
        job["title"] = heading.string if heading is not None else None

        # ``.string`` is None for divs that hold anything other than a single
        # text node, so those contribute nothing to the description.
        texts = (childdiv.string for childdiv in tag.find_all("div"))
        job_desc = "".join(text for text in texts if text)

        words = []
        # Each result item is indexable; element 0 is the phrase text.
        for word in rake_object.run(job_desc):
            phrase = word[0]
            if "year" in phrase:
                job["experience"] = phrase
            else:
                words.append(phrase)
        job["keywords"] = words

        jobs.append(job)

    return json.dumps({"jobs": jobs})