-
Notifications
You must be signed in to change notification settings - Fork 5
/
scraper.rb
61 lines (52 loc) · 1.99 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
require 'scraperwiki'
require 'mechanize'
# Date is stdlib but NOT autoloaded; require it explicitly rather than
# relying on mechanize pulling it in transitively.
require 'date'

agent = Mechanize.new

# All comments on applications go to the council's shared planning inbox.
comment_url = "mailto:[email protected]"
# Search-results endpoint for the City of Melbourne planning register.
base_url = "http://www.melbourne.vic.gov.au/building-and-development/property-information/planning-building-registers/Pages/town-planning-permits-register-search-results.aspx"

# Get applications from the last two weeks (dd/mm/yyyy, as the site expects).
start_date = (Date.today - 14).strftime("%d/%m/%Y")
end_date = Date.today.strftime("%d/%m/%Y")
page = 1
all_urls = []
# Page through the search results, collecting every permit-detail URL.
# The site paginates via the `page` query parameter; an empty results
# page signals the end of the listing.
loop do
  url = "#{base_url}?std=#{start_date}&end=#{end_date}&page=#{page}"
  puts "Fetching #{url}"
  p = agent.get(url)
  urls = p.search('table.permits-list .detail .column1 a').map { |a| a["href"] }
  # Exit BEFORE the page-limit guard so a legitimately empty final page
  # never trips the raise (the old begin/end-until checked the guard first).
  break if urls.empty?

  all_urls += urls
  page += 1
  # FIXME: This is just working around an infinite loop that we currently have
  raise "15 pages processed: aborting due to probably infinite loop" if page == 15
end
# Fetch each permit detail page, map its label/value table rows onto a
# record hash, and persist one row per application keyed on council_reference.
all_urls.each do |url|
  puts "Fetching #{url}"
  p = agent.get(url)
  record = {
    "info_url" => p.uri.to_s,
    "comment_url" => comment_url,
    "date_scraped" => Date.today.to_s
  }
  # Each attribute is laid out as a <tr> with a <th> label and a <td> value.
  p.at('.permit-detail').search('tr').each do |tr|
    heading = tr.at('th').inner_text
    value = tr.at('td').inner_text
    case heading
    when "Application number"
      record["council_reference"] = value
    when "Date received"
      # Dates are published as dd/mm/yyyy. strptime raises on malformed
      # input instead of silently building a bogus date via to_i (which
      # turns garbage into 0).
      record["date_received"] = Date.strptime(value, "%d/%m/%Y").to_s
    when "Address"
      # Drop any parenthesised suffix; to_s covers the empty-value case
      # where split returns no parts.
      record["address"] = value.split("(").first.to_s.strip
    when "Applicant's Name and Address", "Planning officer", "Objections received", "Application status",
         "Decision", "Expiry Date", "Change to Application", "VicSmart application", "", "Amendments to permit"
      # Known headings we deliberately ignore.
    when "Proposal"
      record["description"] = value
    else
      # Fail loudly on an unrecognised heading so the scraper gets updated
      # rather than silently dropping data.
      raise "Unexpected #{heading}"
    end
  end
  ScraperWiki.save_sqlite(['council_reference'], record)
end