diff --git a/README.rst b/README.rst
index cd7e0628..1c2b04d9 100644
--- a/README.rst
+++ b/README.rst
@@ -72,7 +72,8 @@ instead::
 
     $ bash <(curl -Ls https://raw.githubusercontent.com/ic-labs/django-icekit/develop/startproject.sh) {project_name} develop
 
-and change the icekit branch in the generated :code:`requirements-icekit.txt` from :code:`@master` to :code:`@develop`.
+and change the icekit branch in the generated :code:`requirements-icekit.txt` and :code:`Dockerfile` from
+:code:`@master` to :code:`@develop`.
 
 NOTE: Windows users should run this command in Git Bash, which comes with
 `Git for Windows <https://git-for-windows.github.io/>`__.
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/collections/analyzing_data.rst b/docs/collections/analyzing_data.rst
new file mode 100644
index 00000000..a7a3706f
--- /dev/null
+++ b/docs/collections/analyzing_data.rst
@@ -0,0 +1,126 @@
+GLAMkit comes with tools for analysing and harvesting from large collections of various file types, currently:
+
+ * JSON files
+ * XML files with unwieldy or undefined schemas (which may also be badly-formed)
+ * MARC files which use an undocumented set of fields (which may also be badly-formed)
+
+Requirements
+------------
+The different data formats (except for JSON) use libraries that may not be installed by default. Alter the optional
+``import_*`` extras in your project's ``requirements-icekit.txt`` to install them, like this::
+
+    -e git+https://github.com/ic-labs/django-icekit@develop#egg=django-icekit[ ... ,import_xml]
+
+
+JSON Analysis
+=============
+
+*(to be documented)*
+
+XML Analysis
+============
+
+(Add ``import_xml`` to ``requirements-icekit.txt`` to install dependencies.)
+
+``manage.py analyze_xml`` is a command-line tool that takes a path (or ``./``) and returns a csv file containing an
+analysis of every element in every xml file in the path. It requires the ``lxml`` library.
+
+Usage examples::
+
+    manage.py analyze_xml --help            # show help
+    manage.py analyze_xml -l                # list all xml files to be analyzed
+    manage.py analyze_xml                   # analyze all xml files in the current path
+    manage.py analyze_xml > analysis.csv    # analyze all xml files in the current path and write the results to a csv file
+    manage.py analyze_xml path/to/xml/      # analyze all xml files in the given path
+    manage.py analyze_xml path/to/file.xml  # analyze a single xml file
+    manage.py analyze_xml path/to/xml/ -r   # traverse the given path recursively
+
+The analysis csv contains these fields:
+
+=================== ==============================================================
+Column              Description
+=================== ==============================================================
+``path``            A dot-separated path to each XML tag.
+``min_cardinality`` The minimum number of these elements that each of its parents has.
+``max_cardinality`` The maximum number of these elements that each of its parents has.
+``samples``         Non-repeating sample values of the text within the XML tag.
+``attributes``      A list of all the attributes found for each tag.
+=================== ==============================================================
+
+
+Interpreting the analysis
+-------------------------
+
+path
+~~~~
+
+The path is dot-separated. A path that ``looks.like.this`` represents the ``this`` tag in a file structured like
+this::
+
+    <looks>
+        <like>
+            <this>
+                ...
+            </this>
+        </like>
+    </looks>
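+
+The analysis is plain csv, so it is easy to post-process. For example, to list every optional element in a
+collection (a minimal sketch, assuming the analysis was saved to ``analysis.csv`` as above)::
+
+    import csv
+
+    with open("analysis.csv") as f:
+        for row in csv.DictReader(f):
+            # a min_cardinality of 0 means some parents lack this element
+            if row["min_cardinality"] == "0":
+                print "optional:", row["path"]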
+
+min/max_cardinality
+~~~~~~~~~~~~~~~~~~~
+
+``min_cardinality`` and ``max_cardinality`` tell you the minimum and maximum number of these elements you'll have
+to deal with each time you encounter them. If ``min_cardinality`` is 0, the element is optional. If
+``max_cardinality`` is 1, it's a singleton value. If ``max_cardinality`` is more than 1, the element is repeated to
+make up a list.
+
+samples
+~~~~~~~
+
+``samples`` is a particularly useful field. Apart from seeing the values to discern their likely data type, you can
+see the variety of values produced.
+
+Set the number of samples to track with the ``-s``/``--samplesize`` option. The default value is 5; keep it above 3
+so that the range of values is easy to discern.
+
+If you asked for 5 sample values but only got 1, the value is constant. If you got 2 values, there are only 2 values
+in the entire collection (which means that the value could be boolean). If you got 0 values, the tag is always
+empty, or only ever contains children (see the next row of the csv file to see if an element has any children).
+
+attributes
+~~~~~~~~~~
+
+This field lists all the attributes found for the tag, and a sample of their values.
+
+MARC Analysis
+=============
+
+(Add ``import_marc`` to ``requirements-icekit.txt`` to install dependencies.)
+
+``manage.py analyze_marc`` is a command-line tool that takes a path (or ``./``) and returns a csv file containing an
+analysis of every MARC file found in the path. It requires the ``pymarc`` library.
+
+Usage examples::
+
+    manage.py analyze_marc --help            # show help
+    manage.py analyze_marc -l                # list all MARC files to be analyzed
+    manage.py analyze_marc                   # analyze all MARC files in the current path
+    manage.py analyze_marc > analysis.csv    # analyze all MARC files in the current path and write the results to a csv file
+    manage.py analyze_marc path/to/marc/     # analyze all MARC files in the given path
+    manage.py analyze_marc path/to/file.mrc  # analyze a single MARC file
+    manage.py analyze_marc path/to/marc/ -r  # traverse the given path recursively
+
+The analysis csv has a row for each tag (with an empty subfield column), and a row for each subfield. Each row
+contains these fields:
+
+=================== ==============================================================
+Column              Description
+=================== ==============================================================
+``tag``             The 3-digit MARC tag.
+``subfield``        The single-character subfield.
+``tag_meaning``     The English meaning of the tag/subfield, if known.
+``record_count``    The number of records that have at least one of these tags.
+``min_cardinality`` The minimum number of this tag or subfield that each record has.
+``max_cardinality`` The maximum number of this tag or subfield that each record has.
+``samples``         Non-repeating sample values of each tag or subfield.
+=================== ==============================================================
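+
+To eyeball a handful of records before (or after) running the analyzer, ``pymarc`` can also be used directly (a
+minimal sketch, assuming a ``records.mrc`` file)::
+
+    from pymarc import MARCReader
+
+    with open("records.mrc") as f:
+        for record in MARCReader(f, to_unicode=True):
+            # print each tag alongside its formatted value
+            for field in record.get_fields():
+                print field.tag, field.format_field()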
diff --git a/docs/index.rst b/docs/index.rst
index 702af4a4..604fb8fb 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -31,6 +31,7 @@ This documentation covers the technical usage and API of GLAMkit.
 
     architecture/index
     topics/*
+    collections/*
    reference/*
    contributing/index
    changelog
diff --git a/glamkit_collections/management/commands/__init__.py b/glamkit_collections/management/commands/__init__.py
index e69de29b..66a296fd 100644
--- a/glamkit_collections/management/commands/__init__.py
+++ b/glamkit_collections/management/commands/__init__.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+from optparse import make_option
+from django.core.management import BaseCommand
+from glamkit_collections.utils.files import getfiles
+
+
+class AnalysisCommand(BaseCommand):
+    help = "Prints a csv-formatted analysis of all files found at the given paths."
+    file_regex = r"\.xml$"
+
+    option_list = BaseCommand.option_list + (
+        make_option('-r', '--recursive',
+            action='store_true',
+            dest='recursive',
+            default=False,
+            help="traverse the given folder recursively"
+        ),
+        make_option("-l", "--list",
+            action="store_true",
+            dest="list_only",
+            default=False,
+            help="only list the files that would be analyzed"
+        ),
+        make_option("-s", "--samplesize",
+            action="store",
+            type="int",  # without this, a CLI-supplied value arrives as a string
+            dest="sample_length",
+            default=5,
+            help="provide this many samples of each element's text (default: 5)"
+        ),
+    )
+
+    def analyze(self, paths, sample_length):
+        raise NotImplementedError
+
+    def handle(self, *args, **options):
+        try:
+            path = args[0]
+        except IndexError:
+            path = "./"
+
+        paths = getfiles(path=path, regex=self.file_regex, recursive=options['recursive'])
+
+        if options['list_only']:
+            for p in paths:
+                print p
+        else:
+            self.analyze(paths, sample_length=options['sample_length'])
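+
+# Subclasses only need a ``file_regex`` and an ``analyze()`` implementation.
+# For example, a project-level JSON analyzer might look like this (an
+# illustrative sketch; ``jsonanalyze`` is hypothetical and not part of this
+# commit):
+#
+#     class Command(AnalysisCommand):
+#         file_regex = r"\.json$"
+#
+#         def analyze(self, paths, sample_length):
+#             return jsonanalyze(paths, sample_length)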
diff --git a/glamkit_collections/management/commands/analyze_marc.py b/glamkit_collections/management/commands/analyze_marc.py
new file mode 100755
index 00000000..eed30e21
--- /dev/null
+++ b/glamkit_collections/management/commands/analyze_marc.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+from . import AnalysisCommand
+from glamkit_collections.utils.marc.analyze import marcanalyze
+
+
+class Command(AnalysisCommand):
+    help = "Prints a csv-formatted analysis of all MARC files found at the given paths."
+    file_regex = r"\.mrc$"
+
+    def analyze(self, paths, sample_length):
+        return marcanalyze(paths, sample_length)
+
diff --git a/glamkit_collections/management/commands/analyze_xml.py b/glamkit_collections/management/commands/analyze_xml.py
new file mode 100755
index 00000000..503bc7da
--- /dev/null
+++ b/glamkit_collections/management/commands/analyze_xml.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+from glamkit_collections.utils.xml.lib.analyze import xmlanalyze
+from . import AnalysisCommand
+
+
+class Command(AnalysisCommand):
+    help = "Prints a csv-formatted analysis of all XML files found at the given paths."
+    file_regex = r"\.xml$"
+
+    def analyze(self, paths, sample_length):
+        return xmlanalyze(paths, sample_length)
+
diff --git a/glamkit_collections/utils/__init__.py b/glamkit_collections/utils/__init__.py
index e69de29b..6ec9cc2f 100644
--- a/glamkit_collections/utils/__init__.py
+++ b/glamkit_collections/utils/__init__.py
@@ -0,0 +1,7 @@
+# Legacy imports. TODO: deprecate importing these names from here.
+
+from measurements import *
+from slugs import *
+from cleaning import *
+
+
diff --git a/glamkit_collections/utils/cleaning.py b/glamkit_collections/utils/cleaning.py
new file mode 100644
index 00000000..0c108ad2
--- /dev/null
+++ b/glamkit_collections/utils/cleaning.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+import itertools
+import re
+
+
+def ensure_unique(qs, field_name, value, exclude_id=None):
+    """
+    Makes sure that `value` is unique on model.field_name (and non-empty) by
+    appending a counter to duplicate values.
+    """
+    orig = value
+    if not value:
+        value = "None"
+    for x in itertools.count(1):
+        if not qs.exclude(id=exclude_id).filter(**{field_name: value}).exists():
+            break
+        if orig:
+            value = '%s-%d' % (orig, x)
+        else:
+            value = '%d' % x
+
+    return value
+
+
+def strip_parens(s):
+    """Remove a leading '(' and/or a trailing ')'."""
+    result = re.sub(r'^\(', '', s)
+    result = re.sub(r'\)$', '', result)
+    return result
+
+
+def ndashify(s):
+    """Replace ' - ' with an en-dash character."""
+    return re.sub(r' - ', u'–', unicode(s))
+
+
+def fix_line_breaks(s):
+    r"""
+    Convert \r\n and \r to \n chars. Strip any leading or trailing whitespace
+    on each line. Remove blank lines.
+    """
+    lines = [i.strip() for i in s.splitlines()]
+    lines = [i for i in lines if i]  # remove blank lines
+    return "\n".join(lines)
+
+
+def strip_line_breaks(s):
+    r"""
+    Replace runs of \r, \n and space chars with a single space, and strip
+    leading/trailing whitespace.
+    """
+    return re.sub(r'[\r\n ]+', ' ', s).strip()
+
+
+def remove_url_breaking_chars(s):
+    """Remove characters that would break URLs: '?', '#', '&' and '/'."""
+    r = re.sub(r'[\?#&/]', '', s)
+    return r.strip()
+
diff --git a/glamkit_collections/utils/files.py b/glamkit_collections/utils/files.py
new file mode 100644
index 00000000..00fdc29e
--- /dev/null
+++ b/glamkit_collections/utils/files.py
@@ -0,0 +1,25 @@
+import os
+import re
+
+
+def getfiles(path, regex=r"", recursive=True, followlinks=True):
+    """Generate the paths of files under the given path that match the given regex."""
+
+    rex = re.compile(regex)
+
+    if os.path.isfile(path):
+        p = os.path.abspath(path)
+        if rex.search(p):
+            yield p
+    else:
+        if recursive:
+            # NB: os.walk's second positional argument is `topdown`, so
+            # `followlinks` must be passed as a keyword argument.
+            for root, dirs, files in os.walk(path, followlinks=followlinks):
+                for f in files:
+                    p = os.path.abspath(os.path.join(root, f))
+                    if rex.search(p):
+                        yield p
+        else:
+            for f in os.listdir(path):
+                p = os.path.abspath(os.path.join(path, f))
+                if os.path.isfile(p):
+                    if rex.search(p):
+                        yield p
\ No newline at end of file
diff --git a/glamkit_collections/utils/marc/__init__.py b/glamkit_collections/utils/marc/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/glamkit_collections/utils/marc/analyze.py b/glamkit_collections/utils/marc/analyze.py
new file mode 100644
index 00000000..993a5121
--- /dev/null
+++ b/glamkit_collections/utils/marc/analyze.py
@@ -0,0 +1,162 @@
+import sys
+
+from pymarc import MARCReader
+
+from tags import meaning
+from unicode_csv import UnicodeWriter
+
+
+def _update_analysis(analysis, record_analysis, sample_length):
+    # Merge this record's analysis into the global analysis dict.
+
+    # First create 'empty' entries for all currently known tags, so that
+    # min_cardinality is updated for tags this record lacks.
+    for tag, stats in analysis.iteritems():
+        if tag not in record_analysis:
+            record_analysis[tag] = {
+                'cardinality': 0,
+                'samples': set(),
+                'subfields': {},
+            }
+            for subfield in stats['subfields']:
+                record_analysis[tag]['subfields'][subfield] = {
+                    'cardinality': 0,
+                    'samples': set(),
+                }
+
+    # Now merge proper.
+    for tag, stats in record_analysis.iteritems():
+        main_analysis = analysis.get(tag, {
+            'count': 0,
+            'min_cardinality': sys.maxint,
+            'max_cardinality': 0,
+            'subfields': {},
+            'samples': set(),
+        })
+        # Count records (not occurrences): the csv column is record_count.
+        main_analysis['count'] += 1 if stats['cardinality'] else 0
+        main_analysis['min_cardinality'] = min(main_analysis['min_cardinality'], stats['cardinality'])
+        main_analysis['max_cardinality'] = max(main_analysis['max_cardinality'], stats['cardinality'])
+
+        if len(main_analysis['samples']) < sample_length:
+            # Union the two sets, then clip to the sample length (it needs to
+            # be a list to do this).
+            main_analysis['samples'] = set(list(main_analysis['samples'].union(stats['samples']))[:sample_length])
+
+        # And now subfields. INCEPTION.
+        for subfield, substats in stats['subfields'].iteritems():
+            main_sub_analysis = main_analysis['subfields'].get(subfield, {
+                'count': 0,
+                'min_cardinality': sys.maxint,
+                'max_cardinality': 0,
+                'subfields': {},
+                'samples': set(),
+            })
+            main_sub_analysis['count'] += 1 if substats['cardinality'] else 0
+            main_sub_analysis['min_cardinality'] = min(main_sub_analysis['min_cardinality'], substats['cardinality'])
+            main_sub_analysis['max_cardinality'] = max(main_sub_analysis['max_cardinality'], substats['cardinality'])
+
+            if len(main_sub_analysis['samples']) < sample_length:
+                main_sub_analysis['samples'] = set(list(main_sub_analysis['samples'].union(substats['samples']))[:sample_length])
+
+            main_analysis['subfields'][subfield] = main_sub_analysis
+
+        analysis[tag] = main_analysis
+    return analysis
+
+
+def multifile_iter_records(files, sample_length, analysis=None):
+    if analysis is None:  # avoid a shared mutable default argument
+        analysis = {}
+    n = 0
+    for f in files:
+        if not hasattr(f, 'read'):
+            f = open(f)
+        reader = MARCReader(f, to_unicode=True)
+        for record in reader:
+            n += 1
+            if n % 1000 == 0:
+                sys.stderr.write("processed %s records\n" % n)
+            record_analysis = {}
+
+            fields = record.get_fields()
+            for field in fields:
+                attrdict = record_analysis.get(field.tag, {
+                    'cardinality': 0,
+                    'samples': set(),
+                    'subfields': {},
+                })
+
+                attrdict['cardinality'] += 1
+
+                if field.is_control_field():
+                    if len(attrdict['samples']) < sample_length:
+                        attrdict['samples'].add(field.data)
+                else:
+                    for subfield in field.get_subfield_tuples():
+                        key = subfield[0]
+                        sub_attrdict = attrdict['subfields'].get(key, {
+                            'cardinality': 0,
+                            'samples': set(),
+                        })
+
+                        sub_attrdict['cardinality'] += 1
+                        if len(sub_attrdict['samples']) < sample_length:
+                            sub_attrdict['samples'].add(subfield[1])
+
+                        attrdict['subfields'][key] = sub_attrdict
+
+                record_analysis[field.tag] = attrdict
+
+            analysis = _update_analysis(analysis, record_analysis, sample_length)
+
+    return analysis
+
+
+def marcanalyze(files, sample_length=5):
+    """
+    Writes a csv of MARC tags and analyzed values to stdout, showing, for
+    example, how many records exist for each tag.
+
+    =================== ==============================================================
+    Column              Description
+    =================== ==============================================================
+    ``tag``             The 3-digit MARC tag.
+    ``subfield``        The single-character subfield.
+    ``tag_meaning``     The English meaning of the tag/subfield, if known.
+    ``record_count``    The number of records that have at least one of these tags.
+    ``min_cardinality`` The minimum number of this tag or subfield that each record has.
+    ``max_cardinality`` The maximum number of this tag or subfield that each record has.
+    ``samples``         Non-repeating sample values of each tag or subfield.
+    =================== ==============================================================
+    """
+
+    analysis = multifile_iter_records(files, sample_length=sample_length)
+
+    csv_header = ("tag", "subfield", "tag_meaning", "record_count",
+                  "min_cardinality", "max_cardinality", "samples")
+
+    writer = UnicodeWriter(sys.stdout)
+    writer.writerow(csv_header)
+
+    listanalysis = sorted(analysis.iteritems())
+
+    for key, value in listanalysis:
+        v = []
+        v.append(u'"%s"' % key)  # tag
+        v.append(u"")  # subfield
+        v.append(meaning(key))  # tag_meaning
+        v.append(unicode(value['count']))  # record_count
+        v.append(unicode(value['min_cardinality']))
+        v.append(unicode(value['max_cardinality']))
+        v.append(u"\r\r".join(value['samples']))
+        writer.writerow(v)
+
+        sublistanalysis = sorted(value['subfields'].iteritems())
+        for subfield, subvalue in sublistanalysis:
+            v = []
+            v.append(u"")  # tag
+            v.append(subfield)  # subfield
+            v.append(meaning(key, subfield))  # tag_meaning
+            v.append(unicode(subvalue['count']))  # record_count
+            v.append(unicode(subvalue['min_cardinality']))
+            v.append(unicode(subvalue['max_cardinality']))
+            v.append(u"\r\r".join(subvalue['samples']))
+            writer.writerow(v)
+
diff --git a/glamkit_collections/utils/marc/handler.py b/glamkit_collections/utils/marc/handler.py
new file mode 100644
index 00000000..1a9a94b1
--- /dev/null
+++ b/glamkit_collections/utils/marc/handler.py
@@ -0,0 +1,95 @@
+from pymarc import MARCReader
+
+DEBUG_ON_IMPORT_SAVE_ERROR = False
+
+
+def multifile_iter_records(files):
+    for f in files:
+        if not hasattr(f, 'read'):
+            f = open(f)
+        reader = MARCReader(f, to_unicode=True)
+        for record in reader:
+            yield record
+
+
+class BaseHandler(object):
+    """
+    In subclasses, define:
+
+    * ``clean(record)``, which is called with each MARC record and returns a
+      dictionary of parameters that could be sent to a model save(), or None
+      to skip the record;
+
+    * optionally, ``pre_harvest()`` and ``post_harvest()`` hooks.
+    """
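+
+    # An illustrative subclass (``Book`` and its fields are hypothetical;
+    # this sketch assumes each record carries an identifier in tag 001):
+    #
+    #     class BookHandler(BaseHandler):
+    #         def clean(self, record):
+    #             ids = self.get_formatted_fields(record, '001')
+    #             titles = self.get_formatted_fields(record, '245')
+    #             if not (ids and titles):
+    #                 return None  # counted as a skipped record
+    #             return {'id': ids[0], 'title': titles[0]}
+    #
+    #     BookHandler(Book, pk='id').process(['records.mrc'])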
+
+    @staticmethod
+    def get_separated_subfields(record, tag, subfields, separator=" "):
+        results = []
+        for field in record.get_fields(tag):
+            parts = []
+            for s in subfields:
+                part = field[s]
+                if part:
+                    parts.append(part.strip())
+
+            results.append(separator.join(parts).replace(" :", ":").replace(" ;", ";"))
+        return results
+
+    @staticmethod
+    def get_formatted_fields(record, tag):
+        return [field.format_field() for field in record.get_fields(tag)]
+
+    def __init__(self, model, pk="id"):
+        self.model = model
+        self.pk = pk
+
+    def process(self, files, post_only=False):
+        if not post_only:
+            self.pre_harvest()
+        # Step through the given files, cleaning and saving each record.
+        fails = 0
+        count = 0
+
+        for record in multifile_iter_records(files):
+            # try:
+            d = self.clean(record)
+            if d is not None:
+                try:  # update (deleting from a RDBMS updates FKs)
+                    q = {self.pk: d[self.pk]}
+                    m = self.model.objects.get(**q)
+                    del d[self.pk]
+                    for k, v in d.items():
+                        setattr(m, k, v)
+                    m.save()
+                except self.model.DoesNotExist:
+                    m = self.model(**d)
+                    m.save()
+                count += 1
+                if count % 100 == 0:
+                    print "saved %s items" % count
+            else:  # d is None (fail)
+                fails += 1
+                if fails % 10 == 0:
+                    print "SKIPPED %s items" % fails
+            # except Exception as e:
+            #     if DEBUG_ON_IMPORT_SAVE_ERROR:
+            #         from pprint import pprint
+            #         pprint(record)
+            #         print "Cleaned data:"
+            #         pprint(d)
+            #         pprint(e)
+            #         import pdb; pdb.set_trace()
+            #     else:
+            #         raise e
+
+        self.post_harvest()
+
+    def pre_harvest(self):
+        pass
+
+    def clean(self, record):
+        return None
+
+    def post_harvest(self):
+        pass
\ No newline at end of file
diff --git a/glamkit_collections/utils/marc/tags.py b/glamkit_collections/utils/marc/tags.py
new file mode 100644
index 00000000..054b32ce
--- /dev/null
+++ b/glamkit_collections/utils/marc/tags.py
@@ -0,0 +1,109 @@
+TAGS = {
+    "001": u"Control Number - REQUIRED",
+    "003": u"Control Number Identifier - REQUIRED",
+    "005": u"Date and Time of Latest Transaction - REQUIRED",
+    "008": u"""Fixed - Length Data Elements - REQUIRED\r\n/00 - 05 Date of creation\r\n/06 Type of publication date\r\n/07 - 10 Date 1\r\n/11 - 14 Date 2\r\n/15 - 17 Country of publication code\r\n/35 - 37 Language of publication code""",
+    "020": u"ISBN",
+    "022": u"ISSN",
+    "028": u"Publisher Number",
+    "040": u"Cataloging Agency - REQUIRED",
+    "041": u"Language Codes",
+    "082": u"DDC Call Number",
+    "100": u"Main Entry - Personal Name",
+    "100 $a": u"Personal name",
+    "100 $b": u"Numeration",
+    "100 $c": u"Words associated with a name",
+    "100 $d": u"Dates associated with a name",
+    "110": u"Main Entry - Corporate Name",
+    "111": u"Main Entry - Meeting Name",
+    "111 $a": u"Meeting name entry element",
+    "111 $n": u"Number of meeting",
+    "111 $d": u"Year of meeting",
+    "111 $c": u"Place of meeting",
+    "242": u"Translation of Title",
+    "245": u"Title Statement",
+    "245 $a": u"Title proper",
+    "245 $h": u"General medium designator",
+    "245 $b": u"Other title information",
+    "245 $c": u"Statement of responsibility",
+    "246": u"Variant Title",
+    "247": u"Former Title",
+    "250": u"Edition Statement",
+    "254": u"Musical Presentation Statement",
+    "255": u"Cartographic Mathematical Data",
+    "260": u"Imprint - REQUIRED",
+    "260 $a": u"Place of publication",
+    "260 $b": u"Publisher/distributor",
+    "260 $c": u"Date(s) of publication",
+    "300": u"Physical Description",
+    "300 $a": u"Extent",
+    "300 $b": u"Other physical details",
+    "300 $c": u"Dimensions",
+    "310": u"Current Publication Frequency",
+    "362": u"Dates of Publication and/or Sequential Designation",
+    "440": u"Series Statement",
+    "440 $a": u"Main series title",
+    "440 $x": u"ISSN",
+    "440 $v": u"Volume/sequential designation",
+    "440 $n": u"and $p Subseries number and title",
+    "500": u"General Note",
+    "505": u"Formatted contents note",
+    "520": u"Summary",
+    "600": u"Subject - Personal Name",
+    "600 $a": u"Personal name",
+    "600 $b": u"Numeration",
+    "600 $c": u"Words associated with a name",
+    "600 $d": u"Dates associated with a name",
+    "600 $x": u"General subdivision",
+    "600 $2": u"Source of name",
+    "610": u"Subject - Corporate Name",
+    "610 $a": u"Corporate name entry element",
+    "610 $b": u"Subordinate unit",
+    "610 $x": u"General subdivision",
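+
+# meaning() below returns the English label for a tag (and optional
+# subfield) from the table above, or u"" if unknown. For example:
+#
+#     meaning("245", "a")  ->  u"Title proper"
+#     meaning("999")       ->  u""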
"610 $2": u"Source of name", + "611": u"Subject - Meeting Name", + "611 $a": u"Meeting name entry element", + "611 $n": u"Number of meeting", + "611 $d": u"Year of meeting", + "611 $c": u"Place of meeting", + "611 $x": u"General subdivision", + "611 $2": u"Source of name", + "630": u"Subject - Uniform Title", + "630 $a": u"Uniform title", + "630 $x": u"General subdivision", + "630 $2": u"Source of name", + "650": u"Subject - Topical Term", + "650 $a": u"Topical term", + "650 $x": u"General subdivision", + "650 $2": u"Source of name", + "651": u"Subject - Geographic Name", + "651 $a": u"Geographic name", + "651 $x": u"General subdivision", + "651 $2": u"Source of name", + "700": u"Personal Name", + "700 $a": u"Personal name", + "700 $b": u"Numeration", + "700 $c": u"Words associated with a name", + "700 $d": u"Dates associated with a name", + "710": u"Corporate Name", + "710 $a": u"Corporate name entry element", + "710 $b": u"Subordinate unit", + "711": u"Meeting Name", + "711 $a": u"Meeting name entry element", + "711 $n": u"Number of meeting", + "711 $d": u"Year of meeting", + "711 $c": u"Place of meeting", + "856": u"Electronic Location and Access", + "923": u"Local Acquisitions Information", + "923 $d": u"Invoice date", + "923 $n": u"Invoice number", + "923 $s": u"Source code", +} + +def meaning(tag, subfield=""): + if subfield: + key = "%s $%s" % (tag, subfield) + else: + key = tag + return TAGS.get(key, u"") + \ No newline at end of file diff --git a/glamkit_collections/utils/marc/unicode_csv.py b/glamkit_collections/utils/marc/unicode_csv.py new file mode 100644 index 00000000..5eb62f02 --- /dev/null +++ b/glamkit_collections/utils/marc/unicode_csv.py @@ -0,0 +1,30 @@ +import csv, codecs, cStringIO + +class UnicodeWriter: + """ + A CSV writer which will write rows to CSV file "f", + which is encoded in the given encoding. + """ + + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): + # Redirect output to a queue + self.queue = cStringIO.StringIO() + self.writer = csv.writer(self.queue, dialect=dialect, **kwds) + self.stream = f + self.encoder = codecs.getincrementalencoder(encoding)() + + def writerow(self, row): + self.writer.writerow([s.encode("utf-8") for s in row]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) + + def writerows(self, rows): + for row in rows: + self.writerow(row) \ No newline at end of file diff --git a/glamkit_collections/utils.py b/glamkit_collections/utils/measurements.py similarity index 54% rename from glamkit_collections/utils.py rename to glamkit_collections/utils/measurements.py index 1f779fc1..8341115e 100644 --- a/glamkit_collections/utils.py +++ b/glamkit_collections/utils/measurements.py @@ -1,93 +1,8 @@ #!/usr/bin/python # -*- coding: UTF-8 -*- -import itertools import unittest -from django.utils.functional import allow_lazy -from django.utils.safestring import mark_safe -from django.utils.text import slugify -import re from pyparsing import Optional, nums, Word, ParseException, Group -import six -from unidecode import unidecode -def wikipedia_slugify(value, do_unidecode=False): - """ - Converts to ASCII via unidecode. - Converts spaces to underscore. - - Removes characters that - aren't alphanumerics, underscores, or hyphens. - - Preserve case. - - Also strips leading and trailing whitespace. 
diff --git a/glamkit_collections/utils.py b/glamkit_collections/utils/measurements.py
similarity index 54%
rename from glamkit_collections/utils.py
rename to glamkit_collections/utils/measurements.py
index 1f779fc1..8341115e 100644
--- a/glamkit_collections/utils.py
+++ b/glamkit_collections/utils/measurements.py
@@ -1,93 +1,8 @@
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
-import itertools
 import unittest
-from django.utils.functional import allow_lazy
-from django.utils.safestring import mark_safe
-from django.utils.text import slugify
-import re
 from pyparsing import Optional, nums, Word, ParseException, Group
-import six
-from unidecode import unidecode
-
-
-def wikipedia_slugify(value, do_unidecode=False):
-    """
-    Converts to ASCII via unidecode.
-    Converts spaces to underscore.
-
-    Removes characters that
-    aren't alphanumerics, underscores, or hyphens.
-
-    Preserve case.
-
-    Also strips leading and trailing whitespace.
-    """
-    if do_unidecode:
-        value = unidecode(value)
-    value = value.strip()
-    return mark_safe(re.sub('[\s/#\?:@]+', '_', value))
-wikipedia_slugify = allow_lazy(wikipedia_slugify, six.text_type)
-
-
-def alt_slugify(value):
-    if value and value.strip():
-        return re.sub('[-_]', '', slugify(unicode(unidecode(value))))
-    else:
-        return ""
-alt_slugify = allow_lazy(alt_slugify, six.text_type)
-
-
-def ensure_unique(qs, field_name, value, exclude_id=None):
-    """
-    Makes sure that `value` is unique on model.fieldname. And nonempty.
-    """
-    orig = value
-    if not value:
-        value = "None"
-    for x in itertools.count(1):
-        if not qs.exclude(id=exclude_id).filter(**{field_name: value}).exists():
-            break
-        if orig:
-            value = '%s-%d' % (orig, x)
-        else:
-            value = '%d' % x
-
-    return value
-
-
-def strip_parens(s):
-    result = re.sub(r'^\(', '', s)
-    result = re.sub(r'\)$', '', result)
-    return result
-
-
-def ndashify(s):
-    """replace ' - ' with an n-dash character"""
-    return re.sub(r' - ', u'–', unicode(s))
-
-
-def fix_line_breaks(s):
-    """
-    Convert \r\n and \r to \n chars. Strip any leading or trailing whitespace
-    on each line. Remove blank lines.
-    """
-    l = s.splitlines()
-    x = [i.strip() for i in l]
-    x = [i for i in x if i]  # remove blank lines
-    return "\n".join(x)
-
-
-def strip_line_breaks(s):
-    """
-    Remove \r and \n chars, replacing with a space. Strip leading/trailing
-    whitespace on each line. Remove blank lines.
-    """
-    return re.sub(r'[\r\n ]+', ' ', s).strip()
-
-
-def remove_url_breaking_chars(s):
-    r = re.sub(r'[\?#&/]', '', s)
-    return r.strip()
 
 # oh yay we need parsers for imperial units. Thanks Obama.
 INTEGER = Word(nums).setParseAction(lambda t: [int(t[0])])
diff --git a/glamkit_collections/utils/slugs.py b/glamkit_collections/utils/slugs.py
new file mode 100644
index 00000000..4f940a4a
--- /dev/null
+++ b/glamkit_collections/utils/slugs.py
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+import re
+
+from django.utils.functional import allow_lazy
+from django.utils.safestring import mark_safe
+from django.utils.text import slugify
+import six
+from unidecode import unidecode
+
+
+def wikipedia_slugify(value, do_unidecode=False):
+    """
+    Optionally converts to ASCII via unidecode.
+    Converts spaces and the URL-breaking characters /, #, ?, : and @ to
+    underscores, preserving case.
+    Also strips leading and trailing whitespace.
+    """
+    if do_unidecode:
+        value = unidecode(value)
+    value = value.strip()
+    return mark_safe(re.sub(r'[\s/#\?:@]+', '_', value))
+wikipedia_slugify = allow_lazy(wikipedia_slugify, six.text_type)
+
+
+def alt_slugify(value):
+    """
+    A more extreme version of slugify, unidecoding and removing hyphens.
+
+    Useful for fallback slug values.
+    """
+    if value and value.strip():
+        return re.sub('[-_]', '', slugify(unicode(unidecode(value))))
+    else:
+        return ""
+alt_slugify = allow_lazy(alt_slugify, six.text_type)
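+
+# Examples (illustrative):
+#
+#     wikipedia_slugify(u"Vincent van Gogh")  ->  u"Vincent_van_Gogh"
+#     alt_slugify(u"Self-Portrait")           ->  u"selfportrait"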
diff --git a/glamkit_collections/utils/xml/__init__.py b/glamkit_collections/utils/xml/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/glamkit_collections/utils/xml/handler.py b/glamkit_collections/utils/xml/handler.py
new file mode 100644
index 00000000..a30527c2
--- /dev/null
+++ b/glamkit_collections/utils/xml/handler.py
@@ -0,0 +1,79 @@
+from lxml.cssselect import CSSSelector
+
+from lib.constants import BREAK, DO_NOT_DISCARD
+from lib.iterxml import multifile_iter_elems
+
+
+def node(selector, kallable, **kwargs):
+    return {
+        'selector': selector,
+        'callable': kallable,
+        'kwargs': kwargs,
+    }
+
+
+class BaseHandler(object):
+    """
+    In subclasses, define ``handle_nodes``, a mapping of CSS-style selectors
+    to callables:
+
+        handle_nodes = (
+            node('topElement > Organisation', org_callable, **kwargs),
+            node('topElement > Person[id="person-345"]', person_345_callable, **kwargs),
+            node('topElement > Work', work_callable, **kwargs),
+            ...
+        )
+
+    Only the value of the *final* matching callable is returned to the
+    parent, so callables should normally do their own saving.
+
+    Define pre_ and post_harvest handlers if you need them. The post_harvest
+    handler in particular should operate on as few records as possible, in
+    order to make incremental changes feasible. (pre_ and post_harvest
+    handlers quite often call similarly-named handlers on the callables.)
+
+    Then call .process(files). Every time a selector is matched (in
+    top-to-bottom order), the elem, and kwargs, will be passed to the
+    corresponding callable.
+
+    Each callable should:
+
+    1) Handle the contents of the XML elem.
+    2) Log for itself that the handling has happened (e.g. for
+       post-batch-handle cleaning, etc.).
+    3) Return constants.DO_NOT_DISCARD if the memory taken by this elem is
+       NOT to be freed up after the callable is called - i.e. another
+       handler should run on the same data.
+    """
+
+    def _make_handler_list(self):
+        self.HANDLERS = []
+        namespaces = getattr(self, 'namespaces', {})
+        for n in self.handle_nodes:
+            # CSSSelector() compiles the CSS selector into XPath.
+            self.HANDLERS.append(
+                (
+                    CSSSelector(n['selector'], namespaces=namespaces),
+                    n['callable'],
+                    n['kwargs'],
+                )
+            )
+
+    def __init__(self, *args, **kwargs):
+        self._make_handler_list()
+
+    def handle_elem(self, elem):
+        root = elem.getroottree().getroot()
+        # Do the cleaning.
+        x = DO_NOT_DISCARD
+        for selector, kallable, kwargs in self.HANDLERS:
+            # Call the handler if the elem matches this selector (as loaded
+            # into the root so far).
+            if elem in selector(root):
+                x = kallable(elem, **kwargs)
+                if x is not None and x & BREAK:
+                    break
+        return x
+
+    def process(self, files, post_only=False, encoding=None):
+        if not post_only:
+            self.pre_harvest()
+        # Step through the given files, calling handle_elem at the end of
+        # each element.
+        multifile_iter_elems(files, callable_start=None,
+                             callable_end=self.handle_elem, encoding=encoding)
+        self.post_harvest()
+
+    def pre_harvest(self):
+        pass
+
+    def post_harvest(self):
+        pass
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/lib/__init__.py b/glamkit_collections/utils/xml/lib/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/glamkit_collections/utils/xml/lib/analyze.py b/glamkit_collections/utils/xml/lib/analyze.py
new file mode 100644
index 00000000..a374b1b0
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/analyze.py
@@ -0,0 +1,89 @@
+import csv
+import sys
+
+from iterxml import multifile_iter_elems
+from utils import remove_ns, get_path
+
+csv_header = ("path", "min_cardinality", "max_cardinality", "samples", "attributes")
+
+
+def _get_children_from_analysis(path, analysis):
+    for k in analysis.keys():
+        if k.startswith(path) and k != path:  # k is a descendant, but not necessarily a child.
+            if len(k[len(path):].split('.')) == 2:  # it's a child: ['', 'child']
+                yield k
+
+
+def analyze_start(elem, analysis, sample_length):
+    path = get_path(elem)
+
+    if path not in analysis:
+        analysis[path] = {
+            'cardinality_current': 0,
+            'cardinality_min': sys.maxint,
+            'cardinality_max': 0,
+            'values': set(),
+            'attributes': {},
+        }
+
+    analysis[path]['cardinality_current'] += 1
+    # maintain max
+    if analysis[path]['cardinality_current'] > analysis[path]['cardinality_max']:
+        analysis[path]['cardinality_max'] = analysis[path]['cardinality_current']
+
+    # attributes
+    for attr in elem.keys():
+        av = analysis[path]['attributes'].get(attr, set())
+        if len(av) < sample_length:
+            av.add(elem.get(attr))
+        analysis[path]['attributes'][attr] = av
+
+
+def analyze_end(elem, analysis, sample_length):
+    path = get_path(elem)
+
+    # maintain min
+    for c in _get_children_from_analysis(path, analysis):
+        if analysis[c]['cardinality_current'] < analysis[c]['cardinality_min']:
+            analysis[c]['cardinality_min'] = analysis[c]['cardinality_current']
+        analysis[c]['cardinality_current'] = 0
+    # sample values
+    if len(analysis[path]['values']) < sample_length:
+        try:
+            v = elem.text.strip()
+            if v:
+                analysis[path]['values'].add(v)
+        except AttributeError:  # elem.text is None
+            pass
+
+
+def _attributestring(attrdict):
+    ss = []
+    for key, value in attrdict.iteritems():
+        s = "%s = (\"%s\")" % (remove_ns(key), "\", \"".join(value))
+        ss.append(s)
+
+    return "\r\n\r\n".join(ss)
+
+
+def xmlanalyze(files, sample_length=5):
+    """
+    Writes a csv of xml paths and analyzed values to stdout, showing, for
+    example, how many elements exist for every path in an xml file.
+    """
+
+    analysis = {}
+
+    multifile_iter_elems(files, analyze_start, analyze_end,
+                         sample_length=sample_length, analysis=analysis)
+
+    writer = csv.writer(sys.stdout)
+    writer.writerow(csv_header)
+
+    listanalysis = sorted(analysis.iteritems())
+
+    for key, value in listanalysis:
+        v = []
+        v.append(key)  # path
+        if value['cardinality_min'] == sys.maxint:  # top-level nodes never get a min; use the max.
+            value['cardinality_min'] = value['cardinality_max']
+        v.append(value['cardinality_min'])
+        v.append(value['cardinality_max'])
+        v.append("\r\r".join(value['values']))
+        v.append(_attributestring(value['attributes']))
+
+        writer.writerow(v)
diff --git a/glamkit_collections/utils/xml/lib/constants.py b/glamkit_collections/utils/xml/lib/constants.py
new file mode 100644
index 00000000..811c2b7c
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/constants.py
@@ -0,0 +1,3 @@
+DISCARD_AFTER = 1  # None will work too
+DO_NOT_DISCARD = 2
+BREAK = 4
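+
+# These are bitflags: the iterators in iterxml.py (and handler.py) test a
+# handler's return value with `&`, so flags can be combined with `|` - e.g.
+# DO_NOT_DISCARD | BREAK keeps the element in memory and stops iteration.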
diff --git a/glamkit_collections/utils/xml/lib/files.py b/glamkit_collections/utils/xml/lib/files.py
new file mode 100644
index 00000000..73ebf652
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/files.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+
+# A file-like object that cleans up each line, if necessary.
+# This is useful if your xml file declares utf-8 and yet there are iso values in the actual file.
+# http://stackoverflow.com/questions/2352840/parsing-broken-xml-with-lxml-etree-iterparse
+
+class File(object):
+    def __init__(self, filename):
+        self.f = open(filename, 'rt')
+
+    def read(self, size=None):
+        # Despite the name, this returns one *line* per call, which is enough
+        # for lxml's parser; add .replace() calls here to strip bad
+        # characters, e.g. .replace('\x1e', '').
+        try:
+            return self.f.next()
+        except StopIteration:  # file-like objects must return '' at EOF
+            return ''
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/lib/iterxml.py b/glamkit_collections/utils/xml/lib/iterxml.py
new file mode 100644
index 00000000..f0789c65
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/iterxml.py
@@ -0,0 +1,63 @@
+"""
+A pattern for iterating through a list of XML files, calling a callable at
+the start and end of each elem, and freeing up the memory used by that elem
+afterwards.
+
+If the callable returns constants.DO_NOT_DISCARD, the memory is NOT freed up.
+
+Usage:
+
+    iter_elems(path_to_xml, callable_start, callable_end, *args, **kwargs)
+
+or, for several XML files:
+
+    multifile_iter_elems(paths_to_xmls, callable_start, callable_end, *args, **kwargs)
+
+(extra *args and **kwargs are passed through to the callables).
+"""
+import sys
+
+from lxml import etree
+
+from constants import BREAK, DISCARD_AFTER
+
+SKIP_UNTIL = 0
+
+
+def _fast_iter(context, callable_start, callable_end, *args, **kwargs):
+    _iter_count = kwargs.pop('_iter_count', 0)
+    for event, elem in context:
+        if _iter_count >= SKIP_UNTIL:
+            if event == "start":
+                status = callable_start(elem, *args, **kwargs)
+            elif event == "end":
+                status = callable_end(elem, *args, **kwargs)
+                if status is not None and status & BREAK:
+                    break
+            if event == "end":
+                if status is None or status & DISCARD_AFTER:
+                    elem.clear()
+                    while elem.getprevious() is not None:  # delete now-processed preceding siblings
+                        del elem.getparent()[0]
+        _iter_count += 1
+        if _iter_count % 10000 == 0:
+            sys.stderr.write("processing %s elements...\n" % _iter_count)
+        # Work around bug #1185701 by bailing out after the end of the
+        # document root.
+        if elem.getparent() is None:
+            break
+    del context
+    return _iter_count
+
+
+def iter_elems(xml_file, callable_start, callable_end, encoding=None, *args, **kwargs):
+    kwargs['_iter_count'] = kwargs.get('_iter_count', 0)
+    events = []
+    if callable_start is not None:
+        events.append('start')
+    if callable_end is not None:
+        events.append('end')
+    context = etree.iterparse(xml_file, events=events, encoding=encoding)
+    return _fast_iter(context, callable_start, callable_end, *args, **kwargs)
+
+
+def multifile_iter_elems(xml_files, callable_start, callable_end, encoding=None, *args, **kwargs):
+    _iter_count = 0
+
+    for f in xml_files:
+        sys.stderr.write("===\nprocessing file %s\n===\n" % f)
+        kwargs['_iter_count'] = _iter_count
+        _iter_count = iter_elems(f, callable_start, callable_end, encoding, *args, **kwargs)
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/lib/utils.py b/glamkit_collections/utils/xml/lib/utils.py
new file mode 100644
index 00000000..8cf31493
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/utils.py
@@ -0,0 +1,16 @@
+import re
+
+NS_RE = re.compile(r"\{.*?\}")
+
+
+def remove_ns(tag):
+    """Strip any {namespace} prefix from a tag name."""
+    return re.sub(NS_RE, "", tag)
+
+
+def get_path(elem, separator="."):
+    """Return the dotted path of an element, e.g. ``looks.like.this``."""
+    anlist = [remove_ns(e.tag) for e in elem.iterancestors()]
+    anlist.reverse()
+    anlist += [remove_ns(elem.tag)]
+    return separator.join(anlist)
+
+
+def camelcase_to_underscore(s):
+    return re.sub('(((?<=[a-z])[A-Z])|([A-Z](?![A-Z]|$)))', '_\\1', s).lower().strip('_')
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/lib/xml2dict.py b/glamkit_collections/utils/xml/lib/xml2dict.py
new file mode 100644
index 00000000..1b5d2321
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/xml2dict.py
@@ -0,0 +1,27 @@
+from utils import remove_ns, camelcase_to_underscore
+
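+# Converts an lxml element into a plain dict. The output shape is roughly
+# (an illustrative sketch):
+#
+#     <a b="c"><d>text</d></a>  ->  {'_attributes': {'b': 'c'},
+#                                    'd': [{'_value': 'text'}]}
+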
+def xml2dict(tag):
+    r = {}
+
+    # value (tag.text is None for empty tags)
+    if tag.text is not None:
+        v = tag.text.strip()
+        r['_value'] = v
+
+    # attributes
+    if tag.keys():
+        r['_attributes'] = {}
+        for k in tag.keys():
+            _k = remove_ns(k)
+            _k = camelcase_to_underscore(_k)
+            r['_attributes'][_k] = tag.get(k)
+
+    # children: assume every child is potentially a list
+    for child in tag.getchildren():
+        ctag = remove_ns(child.tag)
+        ctag = camelcase_to_underscore(ctag)
+        l = r.get(ctag, [])
+        l.append(xml2dict(child))
+        r[ctag] = l
+    return r
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/processors/__init__.py b/glamkit_collections/utils/xml/processors/__init__.py
new file mode 100644
index 00000000..f0e73fa6
--- /dev/null
+++ b/glamkit_collections/utils/xml/processors/__init__.py
@@ -0,0 +1,2 @@
+from base import *
+from mongo import *
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/processors/base.py b/glamkit_collections/utils/xml/processors/base.py
new file mode 100644
index 00000000..d6b67330
--- /dev/null
+++ b/glamkit_collections/utils/xml/processors/base.py
@@ -0,0 +1,14 @@
+import sys
+
+__all__ = ['BaseProcessor', 'DebugProcessor']
+
+
+class BaseProcessor(object):
+    def __call__(self, tag):
+        raise NotImplementedError(
+            "Subclasses of BaseProcessor need to implement __call__().")
+
+
+class DebugProcessor(BaseProcessor):
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def __call__(self, tag):
+        sys.stderr.write("Fake saving %s\n" % (tag,))
diff --git a/glamkit_collections/utils/xml/processors/django.py b/glamkit_collections/utils/xml/processors/django.py
new file mode 100644
index 00000000..8d1db1f0
--- /dev/null
+++ b/glamkit_collections/utils/xml/processors/django.py
@@ -0,0 +1,57 @@
+from base import BaseProcessor
+from ..lib.xml2dict import xml2dict
+
+__all__ = ['DjangoSaver']
+
+
+DEBUG_ON_IMPORT_SAVE_ERROR = True
+
+
+class DjangoSaver(BaseProcessor):
+    """
+    Convert XML to a dict, clean the dict, then put it into a Django model
+    instance and save it.
+
+    You'll want to override 'clean', and return the kwargs for the model
+    instance creation (or None to skip the record).
+
+    We force the use of 'id' for the PK.
+    """
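+    # An illustrative subclass (hypothetical model and fields, matching the
+    # xml2dict output shape):
+    #
+    #     class WorkSaver(DjangoSaver):
+    #         def clean(self, attribs):
+    #             try:
+    #                 return {
+    #                     'id': attribs['_attributes']['id'],
+    #                     'title': attribs['title'][0]['_value'],
+    #                 }
+    #             except (KeyError, IndexError):
+    #                 return None  # counted as a skipped record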
+    def __init__(self, model):
+        self.model = model
+        self.count = 0
+        self.fails = 0
+
+    def clean(self, attribs):
+        return attribs
+
+    def __call__(self, tag):
+        u = xml2dict(tag)
+        d = self.clean(u)
+
+        try:
+            if d is not None:
+                try:  # update (deleting from a RDBMS updates FKs)
+                    m = self.model.objects.get(pk=d['id'])
+                    del d['id']
+                    for k, v in d.items():
+                        setattr(m, k, v)
+                    m.save()
+                except self.model.DoesNotExist:
+                    m = self.model(**d)
+                    m.save()
+                self.count += 1
+                if self.count % 100 == 0:
+                    print "saved %s items" % self.count
+            else:  # d is None (fail)
+                self.fails += 1
+                if self.fails % 10 == 0:
+                    print "SKIPPED %s items" % self.fails
+
+        except Exception as e:
+            if DEBUG_ON_IMPORT_SAVE_ERROR:
+                from pprint import pprint
+                pprint(e)
+                pprint(u)
+                pprint(d)
+                import pdb; pdb.set_trace()
+            else:
+                raise e
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/processors/mongo.py b/glamkit_collections/utils/xml/processors/mongo.py
new file mode 100644
index 00000000..4e08e264
--- /dev/null
+++ b/glamkit_collections/utils/xml/processors/mongo.py
@@ -0,0 +1,56 @@
+from base import BaseProcessor
+from ..lib.xml2dict import xml2dict
+
+__all__ = ['MongoSaver']
+
+
+DEBUG_ON_IMPORT_SAVE_ERROR = True
+
+
+class MongoSaver(BaseProcessor):
+    """
+    Convert XML to a dict, clean the dict, then put it into a mongo document
+    instance and save it.
+
+    You'll want to override 'clean', and return the kwargs for the document
+    instance creation (or None to skip the record).
+
+    We force the use of 'id' for the PK.
+    """
+    def __init__(self, model):
+        self.model = model
+        self.count = 0
+        self.fails = 0
+
+    def clean(self, attribs):
+        return attribs
+
+    def __call__(self, tag):
+        u = xml2dict(tag)
+        d = self.clean(u)
+
+        try:
+            if d is not None:
+                # try:
+                #     self.model.objects.get(id=d['id'])
+                # except self.model.DoesNotExist:
+                #     pass
+                m = self.model(**d)
+                m.save()
+                self.count += 1
+                if self.count % 100 == 0:
+                    print "saved %s items" % self.count
+            else:  # d is None (fail)
+                self.fails += 1
+                if self.fails % 10 == 0:
+                    print "SKIPPED %s items" % self.fails
+
+        except Exception as e:
+            if DEBUG_ON_IMPORT_SAVE_ERROR:
+                from pprint import pprint
+                print "Exception:"
+                pprint(e)
+                print "xml2dict output:"
+                pprint(u)
+                print "Cleaned dictionary:"
+                pprint(d)
+                import pdb; pdb.set_trace()
+            else:
+                raise e
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d83a4012..8d7efbdd 100644
--- a/setup.py
+++ b/setup.py
@@ -169,7 +169,13 @@ def find_packages(*paths):
             'colormath==2.1.1',
             # Disable as it's devpi-only
             # 'colorweave==0.1+0.ce27c83b4e06a8185531538fa11c18c5ea2c1aba.ixc',
-        ]
+        ],
+        'import_marc': [
+            'pymarc',
+        ],
+        'import_xml': [
+            'lxml',
+        ],
     },
     setup_requires=['setuptools_scm'],
 )