-
Notifications
You must be signed in to change notification settings - Fork 0
/
penguin_parser.py
102 lines (73 loc) · 3.34 KB
/
penguin_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import json
import os
import re
import urllib
import datetime
from lxml import html
import requests
from parser_info import parser_info_from_command_line_args
from post import *
def get_tree(base, subpage, timeout=30):
    """Download ``base + subpage`` and parse the response body as HTML.

    Args:
        base: URL prefix, concatenated unchanged in front of ``subpage``.
        subpage: Path (and query string) appended to ``base``; also echoed
            to stdout as a progress message.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled connection would block the scraper indefinitely.

    Returns:
        The element tree produced by ``lxml.html.fromstring`` for the
        response content.
    """
    print("get tree: " + subpage)
    page = requests.get(base + subpage, timeout=timeout)
    return html.fromstring(page.content)
def next_page(base, tree):
    """Follow the "load more" link of ``tree``, if it has one.

    Looks for an ``<a id="footprintListLoadMore">`` element anywhere in
    ``tree``. When one exists, the ``href`` of the first match is fetched
    through ``get_tree(base, href)`` and the resulting tree is returned.

    Returns:
        The tree of the following page, or ``None`` when no such link is
        present.
    """
    load_more_links = tree.xpath('//a[@id="footprintListLoadMore"]')
    if not load_more_links:
        return None
    return get_tree(base, load_more_links[0].attrib['href'])
def extract_post(item):
    """Populate a ``Post`` from one article node.

    All fields except ``id`` are read relative to the node's
    ``div.footprint-container`` child; ``id`` comes from the ``data-id``
    attribute of the node's parent.

    Args:
        item: Element exposing ``xpath``; the queries below are evaluated
            relative to it.

    Returns:
        A ``Post`` with ``id``, ``bookmark``, ``title``, ``date``, ``time``,
        ``text`` and ``images`` set.

    Raises:
        IndexError: If the id, title or date query matches nothing.
    """
    container = './div[@class="footprint-container"]'
    title_box = container + '/div[@class="title"]'
    images_box = container + '/div[@class="images-container"]'

    post = Post()
    post.id = item.xpath('../@data-id')[0]

    bookmark_parts = item.xpath(container + '/div[@class="bookmark"]/span/text()')
    post.bookmark = ' '.join(bookmark_parts)

    post.title = item.xpath(title_box + '/a/h2/text()')[0]
    post.date = item.xpath(title_box + '/span[@class="date"]/span/@content')[0]
    post.time = extract_time(container, item)

    # Join every text node under the paragraphs, trim the ends, then keep
    # only what precedes the first 'Read more' marker.
    paragraph_text = ''.join(item.xpath(container + '/div[@class="text"]//p//text()'))
    post.text = paragraph_text.strip().rsplit('Read more')[0]

    # The data-url values are prefixed with the scheme to form full URLs.
    image_urls = item.xpath(images_box + '//img/../@data-url')
    post.images = ['https:' + url for url in image_urls]
    return post
def extract_time(content_container_path, item):
    """Return a post's time of day as a 24-hour ``'HH-MM'`` string.

    The time is taken from the first text node of the third ``<span>``
    inside ``div.menuBox/aside.info`` below ``content_container_path``.
    The token following the last ``"at "`` in that text is parsed as a
    12-hour clock value, either with minutes (``3:45PM``) or without
    (``9AM``).

    Args:
        content_container_path: XPath prefix of the post's content container.
        item: Element exposing ``xpath``; the query is evaluated against it.

    Returns:
        The time formatted with ``'%H-%M'``, e.g. ``'15-45'``.

    Raises:
        IndexError: If the span query matches nothing.
        ValueError: If the text contains no ``"at <time>"`` part, or the
            token cannot be parsed as a 12-hour time.
    """
    time_string = item.xpath(
        content_container_path + '/div[@class="menuBox"]/aside[@class="info"]/span[3]/text()')[0]
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    # The greedy leading '.*' makes the capture follow the *last* "at ".
    time_match = re.search(r'.*at ([\w:]*).*', time_string)
    if time_match is None:
        raise ValueError('no "at <time>" found in {!r}'.format(time_string))
    clock = time_match[1]
    # Times on the full hour are written without a minutes part.
    time_format = '%I:%M%p' if ':' in clock else '%I%p'
    return datetime.datetime.strptime(clock, time_format).strftime('%H-%M')
def create_dirs(directory):
    """Create ``directory`` and any missing parents; no-op if it exists.

    Uses ``exist_ok=True`` instead of a separate existence check, so there
    is no window between checking and creating in which another process
    could create the directory and make ``makedirs`` fail.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)
def store_image(image, storage_info):
    """Download one image into the post's image directory.

    The file name is the current counter zero-padded to three digits plus
    ``.jpg``; the counter in ``storage_info`` is incremented after a
    successful download so the next call gets the next name.

    Args:
        image: URL of the image to fetch.
        storage_info: Mutable dict with ``'image_dir'`` (target directory)
            and ``'counter'`` (number used for the next file name).

    Returns:
        The stored file's path relative to the post directory,
        ``'images/<name>'``.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule; the
    # attribute only exists if something else imported it first. Import it
    # explicitly so this function does not rely on that side effect.
    import urllib.request

    image_name = '{0:03d}.jpg'.format(storage_info['counter'])
    image_path = '{}/{}'.format(storage_info['image_dir'], image_name)
    urllib.request.urlretrieve(image, image_path)
    storage_info['counter'] += 1
    return 'images/{}'.format(image_name)
def download(post, storage_path):
    """Write a post's images and metadata below ``storage_path``.

    The post directory is ``<storage_path>/<post.date>/<post.time>``. If
    the post has images, each one is fetched with ``store_image`` into an
    ``images`` subdirectory and ``post.images`` is replaced by the returned
    relative paths. Finally ``post.__dict__`` is dumped as JSON to
    ``meta.json`` in the post directory.

    Args:
        post: Object providing ``date``, ``time`` and ``images``; its
            ``images`` attribute is overwritten when it is non-empty.
        storage_path: Root directory for all downloaded posts.
    """
    directory = '{}/{}/{}'.format(storage_path, post.date, post.time)
    create_dirs(directory)
    print('Download {} to {}'.format(post, directory))

    image_dir = '{}/images'.format(directory)
    if len(post.images) > 0:
        create_dirs(image_dir)
        # File numbering starts at 1; store_image advances the counter.
        storage_info = {'image_dir': image_dir, 'counter': 1}
        stored_paths = []
        for image_url in post.images:
            stored_paths.append(store_image(image_url, storage_info))
        post.images = stored_paths

    with open('{}/meta.json'.format(directory), 'w') as meta:
        json.dump(post.__dict__, meta)
def extract_posts(tree, storage_path):
    """Extract and download every blog post found in ``tree``.

    Each ``<article itemtype="http://schema.org/BlogPosting">`` node is
    turned into a post by ``extract_post`` and passed to ``download``.

    Args:
        tree: Parsed page exposing ``xpath``.
        storage_path: Root directory handed through to ``download``.

    Returns:
        A list with one entry per article, holding whatever ``download``
        returned for it.
    """
    results = []
    for article in tree.xpath('//article[@itemtype="http://schema.org/BlogPosting"]'):
        results.append(download(extract_post(article), storage_path))
    return results
if __name__ == '__main__':
    # Script entry point: start at page 1 of the trip named on the command
    # line, download every post on each page, and follow the "load more"
    # link until next_page reports there is none. Prints the number of
    # entries collected.
    info = parser_info_from_command_line_args()
    base = 'https://findpenguins.com'
    posts = []
    tree = get_tree(base, '/{}?page=1'.format(info.trip))
    while tree is not None:
        posts += extract_posts(tree, info.storage_path)
        tree = next_page(base, tree)
    print(len(posts))