forked from WING-NUS/ACL-Anthology-Codebase
-
Notifications
You must be signed in to change notification settings - Fork 0
/
AnthoXML2AcmCSV.rb
executable file
·151 lines (131 loc) · 3.99 KB
/
AnthoXML2AcmCSV.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env ruby
# -*- ruby -*-
# Command-line script that turns Anthology volume XML files into CSV text
# and, with -z, bundles the CSV plus the volume's files into a zip archive.
#
# NOTE(review): the @@-prefixed names below are class variables assigned at
# the top level of the script. Ruby 3.0 and later raise a RuntimeError on
# top-level class variable access -- confirm which Ruby version runs this.

# Installation root; its lib/ directory is put at the front of the load path
# before any of the requires below are resolved.
@@BASE_DIR = "/home/antho/"
$:.unshift("#{@@BASE_DIR}/lib/")
require 'rubygems'
require 'optparse'
require 'ostruct'
require 'rexml/document'
# NOTE(review): 'zip/zip' looks like the require path of rubyzip releases
# before 1.0 (newer releases use 'zip') -- confirm the installed version.
require 'zip/zip'
require 'time'
# Lets REXML classes (e.g. Document) be referenced without the REXML:: prefix.
include REXML
# defaults
# Version components; joined with "." when the version is printed.
@@VERSION = [1,0]
# NOTE(review): @@INTERVAL is not referenced anywhere else in the visible code.
@@INTERVAL = 100
# Name of the running script, used in usage, version and diagnostic messages.
@@PROG_NAME = File.basename($0)
############################################################
# EXCEPTION HANDLING
# Handler for Ctrl-C: reports the interrupt on standard error and then
# terminates the process via exit(-1).
int_handler = proc do
  # clean up code goes here
  notice = "\n# #{@@PROG_NAME} fatal\t\tReceived a 'SIGINT'\n# #{@@PROG_NAME}\t\texiting cleanly"
  STDERR.puts(notice)
  exit(-1)
end
Signal.trap("SIGINT", int_handler)
############################################################
# PUT CLASS DEFINITION HERE
# Converts an Anthology volume XML file (root element <volume id="...">
# containing <paper id="..."> children) into CSV text, and lists the files
# that belong to the volume.
class AnthoXML2AcmCSV
  # Prefix of every URL written to the CSV output.
  ANTHOLOGY_URL = "http://www.aclweb.org/anthology/".freeze

  # Lists the files that make up a volume.
  #
  # filename - path of the volume XML file.
  #
  # Returns an Array of paths: the XML file itself followed by one
  # "<dir>/<volume id>-<paper id>.pdf" entry per <paper> element, where <dir>
  # is the directory of the XML file. The PDFs are not checked for existence.
  def compile_filelist(filename)
    in_doc = load_document(filename)
    volume_id = in_doc.elements["volume"].attributes["id"]
    directory = File.dirname(filename)

    filelist = [filename]
    in_doc.elements.each("*/paper/") do |e|
      filelist << directory + "/" + handle_ee(e, volume_id) + ".pdf"
    end
    filelist
  end

  # Builds the CSV text for a volume.
  #
  # filename - path of the volume XML file.
  #
  # Returns a String. The first line is the URL of the volume PDF (the XML
  # file's base name with ".xml" replaced by ".pdf"). Every following line is
  # "first page,first author's last name,paper URL". The first <paper> element
  # is skipped (presumably the volume's front matter -- TODO confirm); note
  # that compile_filelist does not skip it.
  def process_file(filename)
    in_doc = load_document(filename)
    volume_id = in_doc.elements["volume"].attributes["id"]

    volume_url = File.basename(filename).gsub(/\.xml/, ".pdf")
    lines = ["#{ANTHOLOGY_URL}#{volume_url}"]

    count = 0
    in_doc.elements.each("*/paper/") do |e|
      count += 1
      next if count == 1

      # Always exactly three columns, so the URL never shifts position when
      # a paper has no author.
      row_elements = [
        handle_pages(e),
        first_author_last_name(e),
        ANTHOLOGY_URL + handle_ee(e, volume_id)
      ]
      lines << row_elements.join(",")
    end
    lines.join("\n") + "\n"
  end

  # Extracts the first page of a paper.
  #
  # e - element whose "pages" child (if any) holds the page information.
  #
  # Returns the first number when the text looks like a range (digits,
  # non-digits, digits), the text unchanged when it does not, and "" when
  # there is no <pages> child or it is empty.
  def handle_pages(e)
    pages = e.elements["pages"]
    return "" unless pages

    text = pages.text.to_s # an empty <pages/> has nil text
    match = /(\d+)\D+\d+/.match(text)
    match ? match[1] : text
  end

  # Builds the identifier used for a paper's electronic edition.
  #
  # e      - element carrying the paper's "id" attribute.
  # volume - volume id string.
  #
  # Returns "<volume>-<paper id>".
  def handle_ee(e, volume)
    # handle electronic editions
    id = e.attributes["id"]
    "#{volume}-#{id}"
  end

  private

  # Parses the XML file and closes the file handle once parsing is done.
  def load_document(filename)
    File.open(filename) { |infile| REXML::Document.new(infile) }
  end

  # Returns the last name of the paper's first author: the <last> child of
  # the first <author> when present, otherwise the last whitespace-separated
  # word of the author's text (names without markup), or "" when the paper
  # has no usable author.
  def first_author_last_name(e)
    tagged_last = e.elements["author/last"]
    return tagged_last.text.to_s if tagged_last

    author = e.elements["author"]
    return "" unless author

    author.text.to_s.split.last.to_s
  end
end
############################################################
# set up options
# Command-line settings; zip output stays off unless -z / --make-zip is given.
options = OpenStruct.new(:zip => false)

option_parser = OptionParser.new do |opts|
  opts.banner = "usage: #{@@PROG_NAME} [options] file_name"
  opts.separator ""

  # Help and version text go to standard error, then the script exits.
  opts.on_tail("-h", "--help", "Show this message") do
    STDERR.puts opts
    exit
  end

  opts.on_tail("-v", "--version", "Show version") do
    STDERR.puts "#{@@PROG_NAME} " + @@VERSION.join('.')
    exit
  end

  opts.on_tail("-z", "--make-zip") { |flag| options.zip = flag }
end

# parse! removes the recognised switches from ARGV, leaving the file names.
option_parser.parse!
# Convert every XML file named on the command line. The CSV text is always
# printed to standard output. With --make-zip, "<rootname>.zip" is also
# written to the current directory; it holds the CSV, the XML file and every
# paper PDF that exists on disk, all under a "<rootname>/" folder.
ARGV.each do |argv|
  ax2ac = AnthoXML2AcmCSV.new
  buf = ax2ac.process_file(argv)
  print buf
  next unless options.zip

  # Strip only a trailing ".xml"; a pattern such as /.xml/ would also eat
  # matching characters in the middle of the name (e.g. "myxml.xml").
  rootname = File.basename(argv, ".xml")
  csv_path = "/tmp/#{rootname}.csv"

  begin
    # make csv file (block form closes the handle even on error)
    File.open(csv_path, "w") { |csv| csv.print buf }

    # compile list of files
    filelist = ax2ac.compile_filelist(argv)

    # make zipfile
    Zip::ZipFile.open("#{rootname}.zip", Zip::ZipFile::CREATE) do |zf|
      zf.mkdir(rootname)
      filelist.each do |f|
        # File.exist? is used because File.exists? was removed in Ruby 3.2.
        if File.exist?(f)
          zf.add("#{rootname}/" + File.basename(f), f)
        else
          $stderr.puts "# #{@@PROG_NAME} warn\t\tFile \"#{f}\" does not exist!\n"
        end
      end
      zf.add("#{rootname}/#{rootname}.csv", csv_path)
    end
  ensure
    # The temporary CSV is removed only after the archive block has finished,
    # and also when anything above raised.
    File.unlink(csv_path) if File.exist?(csv_path)
  end
end