pdfdir-join

#!/bin/bash
#
# pdfdir-join     http://github.com/bronson/pdfdir
# Scott Bronson
# 17 Mar 2009
#
# This script assembles a bunch of PDF files into a single PDF with
# bookmarks to each file.
#
# Your files should be arranged in a hierarchy, somewhat like this:
#     book/01-Introduction.pdf
#     book/02-Engine
#     book/02-Engine/01-Oil.pdf
#     book/02-Engine/01-Freeze Plugs.pdf
# The "01-", "02-" only force the ordering of the files.  Everything before the
# first dash will be stripped from the bookmark name.
#
# Then just run the script:
#     $ pdfdir-join book
#
# TODO: should allow negative counts so the subsection will default to open
# TODO: it would not be too hard to preserve existing bookmarks.  Just need
# to read them, offset them, and include them in the new bookmark file.


# Report a fatal error: write the message (all arguments joined by spaces) to
# stderr, delete the stream file if its name has already been set, and exit
# with status 1.
die () {
	>&2 echo "$*"
	if [ -n "$streamfile" ]; then
		rm -f "$streamfile"
	fi
	exit 1
}

# The first argument is the directory to assemble; one trailing slash is dropped.
root=${1%/}
if [ -z "$root" ]; then
	die "Please specify a directory to operate on"
fi

# The output files are named after the last path component of the root,
# except that "." becomes "out".
outbase=$(basename "$root")
case "x$outbase" in
	x.) outbase=out ;;
esac

# Refuse to clobber an existing result.
if [ -f "$outbase.pdf" ]; then
	die "Will not overwrite $outbase.pdf"
fi


# Turns the output from ghostscript into a bookmark file.
#
# Reads ghostscript's console output on stdin and reacts to three kinds of line:
#   "--- PDFDIR FILE: <path>"           a new input file is about to be run
#   "Processing pages <a> through <b>"  the file was opened; <b> is used as its page count
#   "Page <n>"                          page <n> of the current file was emitted
# For every opened file it writes "<first output page> <path without the root prefix>"
# to file descriptor 6, and it prints human-readable progress on stdout.
#
# Globals read: root, pathcount.  Calls die() when the input is inconsistent.
# File descriptor 6 must be open for writing on entry; it is closed before returning.
process_gs_output() {
	local pat_filename='--- PDFDIR FILE: (.*)$'
	local pat_newfile='Processing pages ([0-9]+) through ([0-9]+)'
	local pat_newpage='Page ([0-9]+)'
	local line curpath plural
	local curpathno=0
	local outpage=1
	# Empty until the first file is seen; an empty name evaluates to 0 in arithmetic.
	local inpage='' totalpages=''
	local corrupt=start

	# IFS= and -r keep backslashes and edge whitespace in file names intact.
	while IFS= read -r line; do
		if [[ $line =~ $pat_filename ]]; then
			# The previous file announced its name but never produced a page.
			if [ "$corrupt" = "yes" ]; then
				printf ' CORRUPT!'
			fi
			corrupt=yes
			curpath="${BASH_REMATCH[1]}"
			curpathno=$((curpathno + 1))
			[ -n "$curpath" ] || die "Wasn't told filename?"
			# printf rather than echo -e, so escapes inside the file name are not interpreted.
			printf '\nFile %s/%s: %s\n' "$curpathno" "$pathcount" "$curpath"
		elif [[ $line =~ $pat_newfile ]]; then
			# check to see if ghostscript has lost its mind
			if [ "x$totalpages" != "x$inpage" ]; then
				die "Did not match: $inpage vs $totalpages"
			fi
			# inpage still holds the last page number seen in the previous file.
			outpage=$((outpage + inpage))
			inpage=1
			totalpages="${BASH_REMATCH[2]}"
			# Quote $root so it is stripped literally and not treated as a glob pattern.
			printf '%s %s\n' "$outpage" "${curpath#"$root"/}" >&6
			plural=s
			if [ "$totalpages" -eq 1 ]; then
				plural=''
			fi
			printf ' %s Page%s:' "$totalpages" "$plural"
		elif [[ $line =~ $pat_newpage ]]; then
			inpage="${BASH_REMATCH[1]}"
			printf ' %s' "$((inpage + outpage - 1))"
			corrupt=no
		fi
	done

	# The loop only flags a file when the next one starts, so check the last one here.
	if [ "$corrupt" = "yes" ]; then
		printf ' CORRUPT!'
	fi
	echo
	exec 6>&-
}

# 1 01 - Test One.pdf
# 2 02 - Section Two/01 - Test 2.1.pdf
# 3 02 - Section Two/02 - Test 2.2.pdf
# 4 03 - Test 3.pdf

# Converts "<page> <path>" lines on stdin into a stream of push/pop tokens on
# stdout.  Each path is split on "/" and compared with the previous path:
# components that are no longer shared are emitted as "pop <component>"
# (innermost first), and the components that are new are emitted as
# "push <page> <component>".  At end of input every component still open is
# popped.
#
# The work is done by an embedded Ruby program.  Because of the `exec`, ruby
# replaces the shell process that calls this function, so it is only usable as
# a pipeline stage (or inside another subshell).
tokenize_stream() {
	# todo: convert this back into shellscript
	# The Ruby source is handed over through process substitution, which leaves
	# stdin connected to the pipeline for STDIN.each_line.  `<<-` strips leading
	# tabs only, so the here-document (and its EOS line) must stay tab-indented.
	# The delimiter is unquoted, so the shell would expand any $, backquote or
	# backslash added to the Ruby text.
	exec ruby <(cat <<-EOS
		# splits two arrays into their similarities and differences.
		# given [a, b, c] and [a, b, d, e] returns [[a, b], [c], [d, e]]
		def common_prefix a, b
		  smaller_size = [a.length, b.length].min
		  smaller_size.times do |i|
		    if a[i] != b[i]
		      return a.take(i), a[i..-1], b[i..-1]
		    end
		  end
		  return a, a[smaller_size..-1], b[smaller_size..-1]
		end


		oldpath = []
		STDIN.each_line do |line|
		  page,fullpath = line.split(' ', 2)
		  path = fullpath.split('/')

		  prefix,outgoing,incoming = common_prefix(oldpath, path)
		  outgoing.reverse.each { |o| puts "pop #{o}" }
		  incoming.each { |o| puts "push #{page} #{o}" }
		  oldpath = path
		end
		oldpath.reverse.each { |o| puts "pop #{o}" }
	EOS
	)
}

# push 1 Test One
# pop Test One
# push 2 Section Two
# push 2 Test 2.1
# pop Test 2.1
# push 3 Test 2.2
# pop Test 2.2
# pop Section Two
# push 4 Test 3
# pop Test 3

# Converts the push/pop token stream on stdin into one line per bookmark on
# stdout, in the form "<count> <page> <title>", with every parent printed
# before its children.
#
# <count> is the number of DIRECT children of the bookmark, which is what the
# pdfmark /Count key expects.  Counting all descendants instead would make
# ghostscript nest later siblings under the wrong parent as soon as the
# hierarchy is three or more levels deep.
#
# The work is done by an embedded Ruby program.  Because of the `exec`, ruby
# replaces the shell process that calls this function, so it is only usable as
# a pipeline stage (or inside another subshell).
extract_marks() {
	# todo: convert this back into shellscript
	# The delimiter is quoted so the shell passes the Ruby source through
	# verbatim.  `<<-` strips leading tabs only, so the here-document (and its
	# EOS line) must stay tab-indented.
	exec ruby <(cat <<-'EOS'
		# Reads one token line and returns [op, page, title].
		# page is nil for pop tokens; title keeps its trailing newline.
		def next_line
		  line = STDIN.readline
		  op,args = line.split(' ', 2)
		  page,args = args.split(' ', 2) if op == 'push'
		  return op, page, args
		end

		# Consumes tokens up to the pop that closes intitle, or up to end of input.
		# Returns [number of direct children, flattened output lines].
		def process_tokens intitle=nil
		  children = 0
		  result = []

		  until STDIN.eof?
		    op,page,title = next_line
		    if op == 'push'
		      count,nested = process_tokens(title)
		      children += 1
		      result << "#{count} #{page} #{title}"
		      result.concat nested
		    elsif op == 'pop'
		      raise "wtf: #{intitle} != #{title}" if intitle && intitle != title
		      return children, result
		    else
		      raise "wtf: #{op} #{page} #{title}"
		    end
		  end
		  return children, result
		end

		# Every title still ends in a newline, so a plain join yields whole lines.
		puts process_tokens.last.join
	EOS
	)
}

# 0 1 Test One
# 2 2 Section Two
# 0 2 Test 2.1
# 0 3 Test 2.2
# 0 4 Test 3

# Converts "<count> <page> <title>" lines on stdin into pdfmark bookmark
# entries on stdout.
#
# For each line:
#   - a non-zero count becomes a "/Count <count> " attribute, zero becomes nothing;
#   - the title loses everything up to and including its first dash, one
#     leading space, and a trailing ".pdf" in any letter case;
#   - "(", ")" and "\" are backslash-escaped so the title is a valid PDF string;
#   - titles that begin with a colon after this clean-up are left out.
# Blank lines are ignored.
format_markfile() {
	local count page title count_attr
	# -r keeps backslashes in the title so that they reach the escaping step.
	while read -r count page title; do
		# Nothing to emit for an empty line.
		[ -n "$count" ] || continue

		if [ "$count" -eq 0 ]; then
			count_attr=''
		else
			count_attr="/Count $count "
		fi

		title="${title#*-}"                         # remove everything before the first dash
		title="${title#" "}"                        # strip a single leading space if present
		title="${title%.[pP][dD][fF]}"              # remove trailing .pdf or .PDF
		title="$(printf '%s\n' "$title" | sed -e 's/\([()\\]\)/\\\1/g')" # escape PDF string
		if [[ "$title" != :* ]]; then
			# don't add filenames beginning with a colon to the index
			printf '%s\n' "[ $count_attr/Title ($title) /Page $page /OUT pdfmark"
		fi
	done
}

# [ /Title (Test One) /Page 1 /OUT pdfmark
# [ /Count 2 /Title (Section Two) /Page 2 /OUT pdfmark
# [ /Title (Test 2.1) /Page 2 /OUT pdfmark
# [ /Title (Test 2.2) /Page 3 /OUT pdfmark
# [ /Title (Test 3) /Page 4 /OUT pdfmark

# and we're done!


# ok lets go

# Locate the ghostscript executable: prefer "ghostscript", fall back to "gs".
# `command -v` is the shell's own lookup, so no external `which` is needed and
# nothing is printed when a name is missing.
if command -v ghostscript >/dev/null 2>&1; then
	ghostscript=ghostscript
elif command -v gs >/dev/null 2>&1; then
	ghostscript=gs
else
	# Report on stderr through the shared helper, like the other fatal errors.
	die 'Could not find ghostscript'
fi

# Working files live in the current directory and are named after the output:
#   $outbase.paths        sorted list of the PDFs found under $root
#   $outbase.stream       "<page> <path>" lines written by process_gs_output on fd 6
#   $outbase.nomarks.pdf  the joined PDF before bookmarks are added
#   $outbase.marks        the generated pdfmark file
pathfile="$outbase".paths
pathcount="$(find "$root" -name "*.pdf" -print | sort | tee "$pathfile" | wc -l | tr -d ' ')"
streamfile="$outbase".stream
exec 6>"$streamfile"

# Pass 1: feed ghostscript one PostScript command per input file.  Each command
# prints a "--- PDFDIR FILE: <path>" marker and then runs the file, so that
# process_gs_output can tell which file the following page messages belong to.
#
# Explicitly use default settings so we can be as close to an unmodified passthrough as ghostscript will allow.
# Other potential values:
#   -dPDFSETTINGS=
#      screen: 1/200 the size and absolutely unacceptable
#      ebook: 1/20th the size and acceptable
#      printer: 1/2 the size and decent
#      prepress: 3/4 the size and not appreciably better than printer
#      default: looks pretty much unchanged, both file size and quality
# https://web.archive.org/web/20190212174315/https://www.ghostscript.com/doc/current/Ps2pdf.htm
# TODO: mess with ebook settings to try to increase contrast?
#
# IFS= and -r keep backslashes and edge whitespace in the path, so the escaping
# below sees the real file name.
while IFS= read -r path; do
  # Escape "(", ")" and "\" so the path is a valid PostScript string literal.
  escpath="$(printf '%s\n' "$path" | sed -e 's/\([()\]\)/\\\1/g')"
  printf '%s\n' "($escpath) dup (--- PDFDIR FILE: ) exch concatstrings = run"
done < "$pathfile" | $ghostscript -sDEVICE=pdfwrite -sOUTPUTFILE="$outbase".nomarks.pdf -dNOPAUSE -dNOSAFER -dPrinted=false \
  -dCompatibilityLevel=1.4 -dPDFSETTINGS=/default | process_gs_output

# The pipeline stages had their own copies of fd 6; release this shell's copy too.
exec 6>&-

# die() removes the stream file, so its absence means pass 1 was aborted.
if [ -f "$streamfile" ]; then
	echo
	echo "Applying bookmarks..."
	tokenize_stream < "$streamfile" | extract_marks | format_markfile > "$outbase".marks
	# Pass 2: rewrite the joined PDF together with the generated pdfmark file.
	if $ghostscript -dBATCH -dNOPAUSE -dPrinted=false -sDEVICE=pdfwrite -sOutputFile="$outbase".pdf "$outbase".nomarks.pdf "$outbase".marks; then
		echo
		echo "PDF saved to $outbase".pdf
		echo
	else
		echo "ghostscript failed while applying bookmarks" >&2
	fi
	rm -f "$streamfile" "$outbase".marks
fi

rm -f "$pathfile"
rm -f "$outbase".nomarks.pdf