-
Notifications
You must be signed in to change notification settings - Fork 4
/
pdfdir-join
executable file
·227 lines (200 loc) · 6.51 KB
/
pdfdir-join
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/bin/bash
#
# pdfdir-join http://github.com/bronson/pdfdir
# Scott Bronson
# 17 Mar 2009
#
# This script assembles a bunch of PDF files into a single PDF with
# bookmarks to each file.
#
# Your files should be arranged in a hierarchy, somewhat like this:
# book/01-Introduction.pdf
# book/02-Engine
# book/02-Engine/01-Oil.pdf
# book/02-Engine/02-Freeze Plugs.pdf
# The "01-", "02-" only force the ordering of the files. Everything before the
# first dash will be stripped from the bookmark name.
#
# Then just run the script:
# $ pdfdir-join book
#
# TODO: should allow negative counts so a subsection can default to closed
# (pdfmark shows a bookmark open when /Count is positive, closed when negative)
# TODO: it would not be too hard to preserve existing bookmarks. Just need
# to read them, offset them, and include them in the new bookmark file.
# Abort the script: print all arguments as one message on stderr, delete the
# intermediate stream file if its name has already been set, and exit with
# status 1.
die () {
  echo "$*" 1>&2
  if [ -n "$streamfile" ]; then
    rm -f "$streamfile"
  fi
  exit 1
}
# Directory to assemble. One trailing slash is dropped so that "$root/" can
# later be stripped from the front of every path found below it.
root="${1%/}"
[ -z "$root" ] && die "Please specify a directory to operate on"
# Fail early with a clear message instead of letting find report the problem.
[ -d "$root" ] || die "Not a directory: $root"

# All output and scratch files are named after the directory; when the
# directory is "." there is no usable name, so fall back to "out".
outbase="$(basename "$root")"
[ "x$outbase" = "x." ] && outbase=out

# Never clobber the result of an earlier run.
[ -f "$outbase".pdf ] && die "Will not overwrite $outbase.pdf"
# Turns the output from ghostscript into a bookmark file.
#
# stdin:  ghostscript's progress output, containing the "--- PDFDIR FILE:"
#         marker lines that the driver loop asks ghostscript to print.
# stdout: human readable progress, one line per input file.
# fd 6:   one "<first output page> <path relative to $root>" line per file;
#         the descriptor must already be open and is closed on return.
#
# Uses the globals $root and $pathcount and calls die on inconsistent input.
process_gs_output() {
  local pat_filename='--- PDFDIR FILE: (.*)$'
  local pat_newfile='Processing pages ([0-9]+) through ([0-9]+)'
  local pat_newpage='Page ([0-9]+)'
  local outpage=1       # output page on which the current file starts
  local inpage=''       # last page number reported within the current file
  local totalpages=''   # page count announced for the current file
  local curpath='' curpathno=0
  local corrupt=start   # "yes" while a file has been named but produced no page
  local line plural

  # IFS= and -r keep backslashes and edge whitespace in file names intact.
  while IFS= read -r line; do
    if [[ $line =~ $pat_filename ]]; then
      # the previous file was announced but never produced a page
      [ "$corrupt" = "yes" ] && echo -n " CORRUPT!"
      corrupt=yes
      curpath="${BASH_REMATCH[1]}"
      curpathno=$((curpathno + 1))
      [ -z "$curpath" ] && die "Wasn't told filename?"
      # printf, unlike echo -e, leaves backslashes in the file name alone
      printf '\nFile %s/%s: %s\n' "$curpathno" "$pathcount" "$curpath"
    elif [[ $line =~ $pat_newfile ]]; then
      # check to see if ghostscript has lost its mind: the previous file must
      # have reported exactly as many pages as it announced
      [ "x$totalpages" != "x$inpage" ] && die "Did not match: $inpage vs $totalpages"
      outpage=$((outpage + inpage))
      inpage=1
      totalpages="${BASH_REMATCH[2]}"
      # quote $root so it is stripped literally rather than used as a glob
      printf '%s %s\n' "$outpage" "${curpath#"$root"/}" >&6
      plural=s
      [ "$totalpages" -eq 1 ] && plural=
      printf ' %s Page%s:' "$totalpages" "$plural"
    elif [[ $line =~ $pat_newpage ]]; then
      inpage="${BASH_REMATCH[1]}"
      echo -n " $((inpage+outpage-1))"
      corrupt=no
    fi
  done

  # the last file has no successor to trigger the check above
  [ "$corrupt" = "yes" ] && echo -n " CORRUPT!"
  echo
  exec 6>&-
}
# 1 01 - Test One.pdf
# 2 02 - Section Two/01 - Test 2.1.pdf
# 3 02 - Section Two/02 - Test 2.2.pdf
# 4 03 - Test 3.pdf
tokenize_stream() {
# todo: convert this back into shellscript
exec ruby <(cat <<-EOS
# splits two arrays into their similarities and differences.
# given [a, b, c] and [a, b, d, e] returns [[a, b], [c], [d, e]]
def common_prefix a, b
smaller_size = [a.length, b.length].min
smaller_size.times do |i|
if a[i] != b[i]
return a.take(i), a[i..-1], b[i..-1]
end
end
return a, a[smaller_size..-1], b[smaller_size..-1]
end
oldpath = []
STDIN.each_line do |line|
page,fullpath = line.split(' ', 2)
path = fullpath.split('/')
prefix,outgoing,incoming = common_prefix(oldpath, path)
outgoing.reverse.each { |o| puts "pop #{o}" }
incoming.each { |o| puts "push #{page} #{o}" }
oldpath = path
end
oldpath.reverse.each { |o| puts "pop #{o}" }
EOS
)
}
# push 1 Test One
# pop Test One
# push 2 Section Two
# push 2 Test 2.1
# pop Test 2.1
# push 3 Test 2.2
# pop Test 2.2
# pop Section Two
# push 4 Test 3
# pop Test 3
extract_marks() {
# todo: convert this back into shellscript
exec ruby <(cat <<-EOS
def next_line
line = STDIN.readline
op,args = line.split(' ', 2)
page,args = args.split(' ', 2) if op == 'push'
return op, page, args
end
def process_tokens inpage=1, intitle=nil
result = []
until STDIN.eof?
op,page,title = next_line
if op == 'push'
nested = process_tokens(page, title)
result << "#{nested.length} #{page} #{title}"
result.concat nested
elsif op == 'pop'
raise "wtf: #{inpage} != #{page}" if intitle && intitle != title
return result
else
raise "wtf: #{op} #{page} #{title}"
end
end
return result
end
puts process_tokens.join
EOS
)
}
# 0 1 Test One
# 2 2 Section Two
# 0 2 Test 2.1
# 0 3 Test 2.2
# 0 4 Test 3
# Converts "<child count> <page> <title>" lines on stdin into ghostscript
# pdfmark bookmark definitions on stdout.
#
# The title is cleaned up first: everything up to and including the first
# dash is dropped, then one leading space, then a trailing .pdf/.PDF
# extension; parentheses and backslashes are escaped for use inside a PDF
# string. Entries whose cleaned title starts with a colon are left out.
format_markfile() {
  local count page title
  # -r keeps backslashes in titles so that they can be escaped below
  while read -r count page title; do
    # an empty token stream yields a single blank line; there is nothing to emit
    [ -z "$count" ] && continue
    if [ "$count" -eq 0 ]; then
      count=''                   # leaf bookmark: no /Count key at all
    else
      count="/Count $count "
    fi
    title="${title#*-}"              # remove everything before the first dash
    title="${title#" "}"             # strip a single leading space if present
    title="${title%.[pP][dD][fF]}"   # remove trailing .pdf or .PDF
    # escape PDF string; printf is used because echo would swallow a title such as "-n"
    title="$(printf '%s\n' "$title" | sed -e 's/\([()\\\]\)/\\\1/g')"
    if [[ "$title" != :* ]]; then
      # don't add filenames beginning with a colon to the index
      printf '%s\n' "[ $count/Title ($title) /Page $page /OUT pdfmark"
    fi
  done
}
# [ /Title (Test One) /Page 1 /OUT pdfmark
# [ /Count 2 /Title (Section Two) /Page 2 /OUT pdfmark
# [ /Title (Test 2.1) /Page 2 /OUT pdfmark
# [ /Title (Test 2.2) /Page 3 /OUT pdfmark
# [ /Title (Test 3) /Page 4 /OUT pdfmark
# and we're done!
# ok lets go
# Locate the ghostscript executable, which may be installed under either name.
# command -v is the portable replacement for which.
if command -v ghostscript >/dev/null 2>&1; then
  ghostscript=ghostscript
elif command -v gs >/dev/null 2>&1; then
  ghostscript=gs
else
  echo 'Could not find ghostscript' >&2
  exit 1
fi
# tokenize_stream and extract_marks run ruby; check before doing the slow
# ghostscript pass rather than failing when the bookmarks are generated.
if ! command -v ruby >/dev/null 2>&1; then
  echo 'Could not find ruby' >&2
  exit 1
fi

# Collect every PDF below $root in sorted order, remembering how many there are.
# -iname also picks up .PDF files, whose extension format_markfile already strips.
# NOTE(review): sort uses the current locale; the bookmark nesting relies on
# the files of one directory staying adjacent in this list.
pathfile="$outbase".paths
pathcount="$(find "$root" -iname "*.pdf" -print | sort | tee "$pathfile" | wc -l | tr -d ' ')"

# process_gs_output records the start page of every file on fd 6.
streamfile="$outbase".stream
exec 6>"$streamfile"

# First pass: feed ghostscript one PostScript line per file that prints a
# marker naming the file and then runs it, and turn ghostscript's progress
# output into the stream file.
#
# Explicitly use default settings so we can be as close to an unmodified passthrough as ghostscript will allow.
# Other potential values:
# -dPDFSETTINGS=
# screen: 1/200 the size and absolutely unacceptable
# ebook: 1/20th the size and acceptable
# printer: 1/2 the size and decent
# prepress: 3/4 the size and not appreciably better than printer
# default: looks pretty much unchanged, both file size and quality
# https://web.archive.org/web/20190212174315/https://www.ghostscript.com/doc/current/Ps2pdf.htm
# TODO: mess with ebook settings to try to increase contrast?
while IFS= read -r path; do
  # IFS= and -r keep backslashes and edge whitespace in the path so that the
  # escaping for the PostScript string below sees the real file name
  escpath="$(printf '%s\n' "$path" | sed -e 's/\([()\]\)/\\\1/g')"
  printf '%s\n' "($escpath) dup (--- PDFDIR FILE: ) exch concatstrings = run"
done < "$pathfile" | $ghostscript -sDEVICE=pdfwrite -sOutputFile="$outbase".nomarks.pdf -dNOPAUSE -dNOSAFER -dPrinted=false \
  -dCompatibilityLevel=1.4 -dPDFSETTINGS=/default | process_gs_output

# The pipeline above ran process_gs_output in a subshell, so the descriptor
# is still open in this shell.
exec 6>&-

# die removes the stream file, so its absence means the first pass failed.
if [ -f "$streamfile" ]; then
  echo
  echo "Applying bookmarks..."
  tokenize_stream < "$streamfile" | extract_marks | format_markfile > "$outbase".marks
  # Second pass: copy the joined PDF once more together with the pdfmark file.
  $ghostscript -dBATCH -dNOPAUSE -dPrinted=false -sDEVICE=pdfwrite -sOutputFile="$outbase".pdf "$outbase".nomarks.pdf "$outbase".marks
  rm "$streamfile" "$outbase".marks
  echo
  echo "PDF saved to $outbase".pdf
  echo
fi

# Remove the remaining scratch files.
rm "$pathfile"
rm -f "$outbase".nomarks.pdf