diff --git a/README.rst b/README.rst
index cd7e0628..1c2b04d9 100644
--- a/README.rst
+++ b/README.rst
@@ -72,7 +72,8 @@ instead::
 
     $ bash <(curl -Ls https://raw.githubusercontent.com/ic-labs/django-icekit/develop/startproject.sh) {project_name} develop
 
-and change the icekit branch in the generated :code:`requirements-icekit.txt` from :code:`@master` to :code:`@develop`.
+and change the icekit branch in the generated :code:`requirements-icekit.txt` and :code:`Dockerfile` from
+:code:`@master` to :code:`@develop`.
 
 NOTE: Windows users should run this command in Git Bash, which comes with
 `Git for Windows <https://git-for-windows.github.io/>`__.
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/collections/analyzing_data.rst b/docs/collections/analyzing_data.rst
new file mode 100644
index 00000000..a7a3706f
--- /dev/null
+++ b/docs/collections/analyzing_data.rst
@@ -0,0 +1,126 @@
+GLAMkit comes with tools for analysing and harvesting from large collections of various file types, currently:
+
+ * JSON files
+ * XML files with unwieldy or undefined schemas (which may also be badly-formed)
+ * MARC files which use an undocumented set of fields (which may also be badly-formed)
+
+Requirements
+------------
+The different data formats (except for JSON) use libraries that may not be installed by default. Alter the optional
+``import_*`` extras in your project's ``requirements-icekit.txt`` to install them, like this::
+
+    -e git+https://github.com/ic-labs/django-icekit@develop#egg=django-icekit[ ... ,import_xml]
+
+
+JSON Analysis
+=============
+
+*(to be documented)*
+
+XML Analysis
+============
+
+(Add ``import_xml`` to ``requirements-icekit.txt`` to install dependencies.)
+
+``manage.py analyze_xml`` is a command-line tool that takes a path (or ``./``) and returns a csv file containing an
+analysis of every element in every xml file in the path. It requires the ``lxml`` library.
+
+Usage examples::
+
+    manage.py analyze_xml --help            # show help
+    manage.py analyze_xml -l                # list all xml files to be analyzed
+    manage.py analyze_xml                   # analyze all xml files in the current path
+    manage.py analyze_xml > analysis.csv    # analyze all xml files in the current path and write the results to a csv file
+    manage.py analyze_xml path/to/xml/      # analyze all xml files in the given path
+    manage.py analyze_xml path/to/file.xml  # analyze a single xml file
+    manage.py analyze_xml path/to/xml/ -r   # traverse the given path recursively
+
+The analysis csv contains these fields:
+
+=================== ==============================================================
+Column              Description
+=================== ==============================================================
+``path``            A dot-separated path to each XML tag.
+``min_cardinality`` The minimum number of these elements that each of its parents has.
+``max_cardinality`` The maximum number of these elements that each of its parents has.
+``samples``         Non-repeating sample values of the text within the XML tag.
+``attributes``      A list of all the attributes found for each tag.
+=================== ==============================================================
+
+
+Interpreting the analysis
+-------------------------
+
+path
+~~~~
+
+The path is dot-separated. A path that ``looks.like.this`` represents the ``this`` tag in a file structured like
+this::
+
+    <looks>
+        <like>
+            <this>
+                ...
+            </this>
+        </like>
+    </looks>
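+
+The analysis is plain csv, so it is easy to post-process. For example, to list every optional element in a
+collection (a minimal sketch, assuming the analysis was saved to ``analysis.csv`` as above)::
+
+    import csv
+
+    with open("analysis.csv") as f:
+        for row in csv.DictReader(f):
+            # a min_cardinality of 0 means some parents lack this element
+            if row["min_cardinality"] == "0":
+                print "optional:", row["path"]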
+
+min/max_cardinality
+~~~~~~~~~~~~~~~~~~~
+
+``min_cardinality`` and ``max_cardinality`` tell you the minimum and maximum number of these elements you'll have
+to deal with each time you encounter them. If ``min_cardinality`` is 0, the element is optional. If
+``max_cardinality`` is 1, it's a singleton value. If ``max_cardinality`` is more than 1, the element is repeated to
+make up a list.
+
+samples
+~~~~~~~
+
+``samples`` is a particularly useful field. Apart from seeing the values to discern their likely data type, you can
+see the variety of values produced.
+
+Set the number of samples to track with the ``-s``/``--samplesize`` option. The default value is 5; keep it above 3
+so that the range of values is easy to discern.
+
+If you asked for 5 sample values but only got 1, the value is constant. If you got 2 values, there are only 2 values
+in the entire collection (which means that the value could be boolean). If you got 0 values, the tag is always
+empty, or only ever contains children (see the next row of the csv file to see if an element has any children).
+
+attributes
+~~~~~~~~~~
+
+This field lists all the attributes found for the tag, and a sample of their values.
+
+MARC Analysis
+=============
+
+(Add ``import_marc`` to ``requirements-icekit.txt`` to install dependencies.)
+
+``manage.py analyze_marc`` is a command-line tool that takes a path (or ``./``) and returns a csv file containing an
+analysis of every MARC file found in the path. It requires the ``pymarc`` library.
+
+Usage examples::
+
+    manage.py analyze_marc --help            # show help
+    manage.py analyze_marc -l                # list all MARC files to be analyzed
+    manage.py analyze_marc                   # analyze all MARC files in the current path
+    manage.py analyze_marc > analysis.csv    # analyze all MARC files in the current path and write the results to a csv file
+    manage.py analyze_marc path/to/marc/     # analyze all MARC files in the given path
+    manage.py analyze_marc path/to/file.mrc  # analyze a single MARC file
+    manage.py analyze_marc path/to/marc/ -r  # traverse the given path recursively
+
+The analysis csv has a row for each tag (with an empty subfield column), and a row for each subfield. Each row
+contains these fields:
+
+=================== ==============================================================
+Column              Description
+=================== ==============================================================
+``tag``             The 3-digit MARC tag.
+``subfield``        The single-character subfield.
+``tag_meaning``     The English meaning of the tag/subfield, if known.
+``record_count``    The number of records that have at least one of these tags.
+``min_cardinality`` The minimum number of this tag or subfield that each record has.
+``max_cardinality`` The maximum number of this tag or subfield that each record has.
+``samples``         Non-repeating sample values of each tag or subfield.
+=================== ==============================================================
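+
+To eyeball a handful of records before (or after) running the analyzer, ``pymarc`` can also be used directly (a
+minimal sketch, assuming a ``records.mrc`` file)::
+
+    from pymarc import MARCReader
+
+    with open("records.mrc") as f:
+        for record in MARCReader(f, to_unicode=True):
+            # print each tag alongside its formatted value
+            for field in record.get_fields():
+                print field.tag, field.format_field()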
diff --git a/docs/index.rst b/docs/index.rst
index 702af4a4..604fb8fb 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -31,6 +31,7 @@ This documentation covers the technical usage and API of GLAMkit.
 
     architecture/index
     topics/*
+    collections/*
    reference/*
    contributing/index
    changelog
diff --git a/glamkit_collections/management/commands/__init__.py b/glamkit_collections/management/commands/__init__.py
index e69de29b..66a296fd 100644
--- a/glamkit_collections/management/commands/__init__.py
+++ b/glamkit_collections/management/commands/__init__.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+from optparse import make_option
+from django.core.management import BaseCommand
+from glamkit_collections.utils.files import getfiles
+
+
+class AnalysisCommand(BaseCommand):
+    help = "Prints a csv-formatted analysis of all files found at the given paths."
+    file_regex = r"\.xml$"
+
+    option_list = BaseCommand.option_list + (
+        make_option('-r', '--recursive',
+            action='store_true',
+            dest='recursive',
+            default=False,
+            help="traverse the given folder recursively"
+        ),
+        make_option("-l", "--list",
+            action="store_true",
+            dest="list_only",
+            default=False,
+            help="only list the files that would be analyzed"
+        ),
+        make_option("-s", "--samplesize",
+            action="store",
+            type="int",  # without this, a CLI-supplied value arrives as a string
+            dest="sample_length",
+            default=5,
+            help="provide this many samples of each element's text (default: 5)"
+        ),
+    )
+
+    def analyze(self, paths, sample_length):
+        raise NotImplementedError
+
+    def handle(self, *args, **options):
+        try:
+            path = args[0]
+        except IndexError:
+            path = "./"
+
+        paths = getfiles(path=path, regex=self.file_regex, recursive=options['recursive'])
+
+        if options['list_only']:
+            for p in paths:
+                print p
+        else:
+            self.analyze(paths, sample_length=options['sample_length'])
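+
+# Subclasses only need a ``file_regex`` and an ``analyze()`` implementation.
+# For example, a project-level JSON analyzer might look like this (an
+# illustrative sketch; ``jsonanalyze`` is hypothetical and not part of this
+# commit):
+#
+#     class Command(AnalysisCommand):
+#         file_regex = r"\.json$"
+#
+#         def analyze(self, paths, sample_length):
+#             return jsonanalyze(paths, sample_length)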
diff --git a/glamkit_collections/management/commands/analyze_marc.py b/glamkit_collections/management/commands/analyze_marc.py
new file mode 100755
index 00000000..eed30e21
--- /dev/null
+++ b/glamkit_collections/management/commands/analyze_marc.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+from . import AnalysisCommand
+from glamkit_collections.utils.marc.analyze import marcanalyze
+
+
+class Command(AnalysisCommand):
+    help = "Prints a csv-formatted analysis of all MARC files found at the given paths."
+    file_regex = r"\.mrc$"
+
+    def analyze(self, paths, sample_length):
+        return marcanalyze(paths, sample_length)
+
diff --git a/glamkit_collections/management/commands/analyze_xml.py b/glamkit_collections/management/commands/analyze_xml.py
new file mode 100755
index 00000000..503bc7da
--- /dev/null
+++ b/glamkit_collections/management/commands/analyze_xml.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+from glamkit_collections.utils.xml.lib.analyze import xmlanalyze
+from . import AnalysisCommand
+
+
+class Command(AnalysisCommand):
+    help = "Prints a csv-formatted analysis of all XML files found at the given paths."
+    file_regex = r"\.xml$"
+
+    def analyze(self, paths, sample_length):
+        return xmlanalyze(paths, sample_length)
+
diff --git a/glamkit_collections/utils/__init__.py b/glamkit_collections/utils/__init__.py
index e69de29b..6ec9cc2f 100644
--- a/glamkit_collections/utils/__init__.py
+++ b/glamkit_collections/utils/__init__.py
@@ -0,0 +1,7 @@
+# Legacy imports. TODO: deprecate importing these names from here.
+
+from measurements import *
+from slugs import *
+from cleaning import *
+
+
diff --git a/glamkit_collections/utils/cleaning.py b/glamkit_collections/utils/cleaning.py
new file mode 100644
index 00000000..0c108ad2
--- /dev/null
+++ b/glamkit_collections/utils/cleaning.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+import itertools
+import re
+
+
+def ensure_unique(qs, field_name, value, exclude_id=None):
+    """
+    Makes sure that `value` is unique on model.field_name (and non-empty) by
+    appending a counter to duplicate values.
+    """
+    orig = value
+    if not value:
+        value = "None"
+    for x in itertools.count(1):
+        if not qs.exclude(id=exclude_id).filter(**{field_name: value}).exists():
+            break
+        if orig:
+            value = '%s-%d' % (orig, x)
+        else:
+            value = '%d' % x
+
+    return value
+
+
+def strip_parens(s):
+    """Remove a leading '(' and/or a trailing ')'."""
+    result = re.sub(r'^\(', '', s)
+    result = re.sub(r'\)$', '', result)
+    return result
+
+
+def ndashify(s):
+    """Replace ' - ' with an en-dash character."""
+    return re.sub(r' - ', u'–', unicode(s))
+
+
+def fix_line_breaks(s):
+    r"""
+    Convert \r\n and \r to \n chars. Strip any leading or trailing whitespace
+    on each line. Remove blank lines.
+    """
+    lines = [i.strip() for i in s.splitlines()]
+    lines = [i for i in lines if i]  # remove blank lines
+    return "\n".join(lines)
+
+
+def strip_line_breaks(s):
+    r"""
+    Replace runs of \r, \n and space chars with a single space, and strip
+    leading/trailing whitespace.
+    """
+    return re.sub(r'[\r\n ]+', ' ', s).strip()
+
+
+def remove_url_breaking_chars(s):
+    """Remove characters that would break URLs: '?', '#', '&' and '/'."""
+    r = re.sub(r'[\?#&/]', '', s)
+    return r.strip()
+
diff --git a/glamkit_collections/utils/files.py b/glamkit_collections/utils/files.py
new file mode 100644
index 00000000..00fdc29e
--- /dev/null
+++ b/glamkit_collections/utils/files.py
@@ -0,0 +1,25 @@
+import os
+import re
+
+
+def getfiles(path, regex=r"", recursive=True, followlinks=True):
+    """Generate the paths of files under the given path that match the given regex."""
+
+    rex = re.compile(regex)
+
+    if os.path.isfile(path):
+        p = os.path.abspath(path)
+        if rex.search(p):
+            yield p
+    else:
+        if recursive:
+            # NB: os.walk's second positional argument is `topdown`, so
+            # `followlinks` must be passed as a keyword argument.
+            for root, dirs, files in os.walk(path, followlinks=followlinks):
+                for f in files:
+                    p = os.path.abspath(os.path.join(root, f))
+                    if rex.search(p):
+                        yield p
+        else:
+            for f in os.listdir(path):
+                p = os.path.abspath(os.path.join(path, f))
+                if os.path.isfile(p):
+                    if rex.search(p):
+                        yield p
\ No newline at end of file
diff --git a/glamkit_collections/utils/marc/__init__.py b/glamkit_collections/utils/marc/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/glamkit_collections/utils/marc/analyze.py b/glamkit_collections/utils/marc/analyze.py
new file mode 100644
index 00000000..993a5121
--- /dev/null
+++ b/glamkit_collections/utils/marc/analyze.py
@@ -0,0 +1,162 @@
+import sys
+
+from pymarc import MARCReader
+
+from tags import meaning
+from unicode_csv import UnicodeWriter
+
+
+def _update_analysis(analysis, record_analysis, sample_length):
+    # Merge this record's analysis into the global analysis dict.
+
+    # First create 'empty' entries for all currently known tags, so that
+    # min_cardinality is updated for tags this record lacks.
+    for tag, stats in analysis.iteritems():
+        if tag not in record_analysis:
+            record_analysis[tag] = {
+                'cardinality': 0,
+                'samples': set(),
+                'subfields': {},
+            }
+            for subfield in stats['subfields']:
+                record_analysis[tag]['subfields'][subfield] = {
+                    'cardinality': 0,
+                    'samples': set(),
+                }
+
+    # Now merge proper.
+    for tag, stats in record_analysis.iteritems():
+        main_analysis = analysis.get(tag, {
+            'count': 0,
+            'min_cardinality': sys.maxint,
+            'max_cardinality': 0,
+            'subfields': {},
+            'samples': set(),
+        })
+        # Count records (not occurrences): the csv column is record_count.
+        main_analysis['count'] += 1 if stats['cardinality'] else 0
+        main_analysis['min_cardinality'] = min(main_analysis['min_cardinality'], stats['cardinality'])
+        main_analysis['max_cardinality'] = max(main_analysis['max_cardinality'], stats['cardinality'])
+
+        if len(main_analysis['samples']) < sample_length:
+            # Union the two sets, then clip to the sample length (it needs to
+            # be a list to do this).
+            main_analysis['samples'] = set(list(main_analysis['samples'].union(stats['samples']))[:sample_length])
+
+        # And now subfields. INCEPTION.
+        for subfield, substats in stats['subfields'].iteritems():
+            main_sub_analysis = main_analysis['subfields'].get(subfield, {
+                'count': 0,
+                'min_cardinality': sys.maxint,
+                'max_cardinality': 0,
+                'subfields': {},
+                'samples': set(),
+            })
+            main_sub_analysis['count'] += 1 if substats['cardinality'] else 0
+            main_sub_analysis['min_cardinality'] = min(main_sub_analysis['min_cardinality'], substats['cardinality'])
+            main_sub_analysis['max_cardinality'] = max(main_sub_analysis['max_cardinality'], substats['cardinality'])
+
+            if len(main_sub_analysis['samples']) < sample_length:
+                main_sub_analysis['samples'] = set(list(main_sub_analysis['samples'].union(substats['samples']))[:sample_length])
+
+            main_analysis['subfields'][subfield] = main_sub_analysis
+
+        analysis[tag] = main_analysis
+    return analysis
+
+
+def multifile_iter_records(files, sample_length, analysis=None):
+    if analysis is None:  # avoid a shared mutable default argument
+        analysis = {}
+    n = 0
+    for f in files:
+        if not hasattr(f, 'read'):
+            f = open(f)
+        reader = MARCReader(f, to_unicode=True)
+        for record in reader:
+            n += 1
+            if n % 1000 == 0:
+                sys.stderr.write("processed %s records\n" % n)
+            record_analysis = {}
+
+            fields = record.get_fields()
+            for field in fields:
+                attrdict = record_analysis.get(field.tag, {
+                    'cardinality': 0,
+                    'samples': set(),
+                    'subfields': {},
+                })
+
+                attrdict['cardinality'] += 1
+
+                if field.is_control_field():
+                    if len(attrdict['samples']) < sample_length:
+                        attrdict['samples'].add(field.data)
+                else:
+                    for subfield in field.get_subfield_tuples():
+                        key = subfield[0]
+                        sub_attrdict = attrdict['subfields'].get(key, {
+                            'cardinality': 0,
+                            'samples': set(),
+                        })
+
+                        sub_attrdict['cardinality'] += 1
+                        if len(sub_attrdict['samples']) < sample_length:
+                            sub_attrdict['samples'].add(subfield[1])
+
+                        attrdict['subfields'][key] = sub_attrdict
+
+                record_analysis[field.tag] = attrdict
+
+            analysis = _update_analysis(analysis, record_analysis, sample_length)
+
+    return analysis
+
+
+def marcanalyze(files, sample_length=5):
+    """
+    Writes a csv of MARC tags and analyzed values to stdout, showing, for
+    example, how many records exist for each tag.
+
+    =================== ==============================================================
+    Column              Description
+    =================== ==============================================================
+    ``tag``             The 3-digit MARC tag.
+    ``subfield``        The single-character subfield.
+    ``tag_meaning``     The English meaning of the tag/subfield, if known.
+    ``record_count``    The number of records that have at least one of these tags.
+    ``min_cardinality`` The minimum number of this tag or subfield that each record has.
+    ``max_cardinality`` The maximum number of this tag or subfield that each record has.
+    ``samples``         Non-repeating sample values of each tag or subfield.
+    =================== ==============================================================
+    """
+
+    analysis = multifile_iter_records(files, sample_length=sample_length)
+
+    csv_header = ("tag", "subfield", "tag_meaning", "record_count",
+                  "min_cardinality", "max_cardinality", "samples")
+
+    writer = UnicodeWriter(sys.stdout)
+    writer.writerow(csv_header)
+
+    listanalysis = sorted(analysis.iteritems())
+
+    for key, value in listanalysis:
+        v = []
+        v.append(u'"%s"' % key)  # tag
+        v.append(u"")  # subfield
+        v.append(meaning(key))  # tag_meaning
+        v.append(unicode(value['count']))  # record_count
+        v.append(unicode(value['min_cardinality']))
+        v.append(unicode(value['max_cardinality']))
+        v.append(u"\r\r".join(value['samples']))
+        writer.writerow(v)
+
+        sublistanalysis = sorted(value['subfields'].iteritems())
+        for subfield, subvalue in sublistanalysis:
+            v = []
+            v.append(u"")  # tag
+            v.append(subfield)  # subfield
+            v.append(meaning(key, subfield))  # tag_meaning
+            v.append(unicode(subvalue['count']))  # record_count
+            v.append(unicode(subvalue['min_cardinality']))
+            v.append(unicode(subvalue['max_cardinality']))
+            v.append(u"\r\r".join(subvalue['samples']))
+            writer.writerow(v)
+
diff --git a/glamkit_collections/utils/marc/handler.py b/glamkit_collections/utils/marc/handler.py
new file mode 100644
index 00000000..1a9a94b1
--- /dev/null
+++ b/glamkit_collections/utils/marc/handler.py
@@ -0,0 +1,95 @@
+from pymarc import MARCReader
+
+DEBUG_ON_IMPORT_SAVE_ERROR = False
+
+
+def multifile_iter_records(files):
+    for f in files:
+        if not hasattr(f, 'read'):
+            f = open(f)
+        reader = MARCReader(f, to_unicode=True)
+        for record in reader:
+            yield record
+
+
+class BaseHandler(object):
+    """
+    In subclasses, define:
+
+    * ``clean(record)``, which is called with each MARC record and returns a
+      dictionary of parameters that could be sent to a model save(), or None
+      to skip the record;
+
+    * optionally, ``pre_harvest()`` and ``post_harvest()`` hooks.
+    """
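+
+    # An illustrative subclass (``Book`` and its fields are hypothetical;
+    # this sketch assumes each record carries an identifier in tag 001):
+    #
+    #     class BookHandler(BaseHandler):
+    #         def clean(self, record):
+    #             ids = self.get_formatted_fields(record, '001')
+    #             titles = self.get_formatted_fields(record, '245')
+    #             if not (ids and titles):
+    #                 return None  # counted as a skipped record
+    #             return {'id': ids[0], 'title': titles[0]}
+    #
+    #     BookHandler(Book, pk='id').process(['records.mrc'])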
+
+    @staticmethod
+    def get_separated_subfields(record, tag, subfields, separator=" "):
+        results = []
+        for field in record.get_fields(tag):
+            parts = []
+            for s in subfields:
+                part = field[s]
+                if part:
+                    parts.append(part.strip())
+
+            results.append(separator.join(parts).replace(" :", ":").replace(" ;", ";"))
+        return results
+
+    @staticmethod
+    def get_formatted_fields(record, tag):
+        return [field.format_field() for field in record.get_fields(tag)]
+
+    def __init__(self, model, pk="id"):
+        self.model = model
+        self.pk = pk
+
+    def process(self, files, post_only=False):
+        if not post_only:
+            self.pre_harvest()
+        # Step through the given files, cleaning and saving each record.
+        fails = 0
+        count = 0
+
+        for record in multifile_iter_records(files):
+            # try:
+            d = self.clean(record)
+            if d is not None:
+                try:  # update (deleting from a RDBMS updates FKs)
+                    q = {self.pk: d[self.pk]}
+                    m = self.model.objects.get(**q)
+                    del d[self.pk]
+                    for k, v in d.items():
+                        setattr(m, k, v)
+                    m.save()
+                except self.model.DoesNotExist:
+                    m = self.model(**d)
+                    m.save()
+                count += 1
+                if count % 100 == 0:
+                    print "saved %s items" % count
+            else:  # d is None (fail)
+                fails += 1
+                if fails % 10 == 0:
+                    print "SKIPPED %s items" % fails
+            # except Exception as e:
+            #     if DEBUG_ON_IMPORT_SAVE_ERROR:
+            #         from pprint import pprint
+            #         pprint(record)
+            #         print "Cleaned data:"
+            #         pprint(d)
+            #         pprint(e)
+            #         import pdb; pdb.set_trace()
+            #     else:
+            #         raise e
+
+        self.post_harvest()
+
+    def pre_harvest(self):
+        pass
+
+    def clean(self, record):
+        return None
+
+    def post_harvest(self):
+        pass
\ No newline at end of file
diff --git a/glamkit_collections/utils/marc/tags.py b/glamkit_collections/utils/marc/tags.py
new file mode 100644
index 00000000..054b32ce
--- /dev/null
+++ b/glamkit_collections/utils/marc/tags.py
@@ -0,0 +1,109 @@
+TAGS = {
+    "001": u"Control Number - REQUIRED",
+    "003": u"Control Number Identifier - REQUIRED",
+    "005": u"Date and Time of Latest Transaction - REQUIRED",
+    "008": u"""Fixed - Length Data Elements - REQUIRED\r\n/00 - 05 Date of creation\r\n/06 Type of publication date\r\n/07 - 10 Date 1\r\n/11 - 14 Date 2\r\n/15 - 17 Country of publication code\r\n/35 - 37 Language of publication code""",
+    "020": u"ISBN",
+    "022": u"ISSN",
+    "028": u"Publisher Number",
+    "040": u"Cataloging Agency - REQUIRED",
+    "041": u"Language Codes",
+    "082": u"DDC Call Number",
+    "100": u"Main Entry - Personal Name",
+    "100 $a": u"Personal name",
+    "100 $b": u"Numeration",
+    "100 $c": u"Words associated with a name",
+    "100 $d": u"Dates associated with a name",
+    "110": u"Main Entry - Corporate Name",
+    "111": u"Main Entry - Meeting Name",
+    "111 $a": u"Meeting name entry element",
+    "111 $n": u"Number of meeting",
+    "111 $d": u"Year of meeting",
+    "111 $c": u"Place of meeting",
+    "242": u"Translation of Title",
+    "245": u"Title Statement",
+    "245 $a": u"Title proper",
+    "245 $h": u"General medium designator",
+    "245 $b": u"Other title information",
+    "245 $c": u"Statement of responsibility",
+    "246": u"Variant Title",
+    "247": u"Former Title",
+    "250": u"Edition Statement",
+    "254": u"Musical Presentation Statement",
+    "255": u"Cartographic Mathematical Data",
+    "260": u"Imprint - REQUIRED",
+    "260 $a": u"Place of publication",
+    "260 $b": u"Publisher/distributor",
+    "260 $c": u"Date(s) of publication",
+    "300": u"Physical Description",
+    "300 $a": u"Extent",
+    "300 $b": u"Other physical details",
+    "300 $c": u"Dimensions",
+    "310": u"Current Publication Frequency",
+    "362": u"Dates of Publication and/or Sequential Designation",
+    "440": u"Series Statement",
+    "440 $a": u"Main series title",
+    "440 $x": u"ISSN",
+    "440 $v": u"Volume/sequential designation",
+    "440 $n": u"and $p Subseries number and title",
+    "500": u"General Note",
+    "505": u"Formatted contents note",
+    "520": u"Summary",
+    "600": u"Subject - Personal Name",
+    "600 $a": u"Personal name",
+    "600 $b": u"Numeration",
+    "600 $c": u"Words associated with a name",
+    "600 $d": u"Dates associated with a name",
+    "600 $x": u"General subdivision",
+    "600 $2": u"Source of name",
+    "610": u"Subject - Corporate Name",
+    "610 $a": u"Corporate name entry element",
+    "610 $b": u"Subordinate unit",
+    "610 $x": u"General subdivision",
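+
+# meaning() below returns the English label for a tag (and optional
+# subfield) from the table above, or u"" if unknown. For example:
+#
+#     meaning("245", "a")  ->  u"Title proper"
+#     meaning("999")       ->  u""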
"610 $2": u"Source of name", + "611": u"Subject - Meeting Name", + "611 $a": u"Meeting name entry element", + "611 $n": u"Number of meeting", + "611 $d": u"Year of meeting", + "611 $c": u"Place of meeting", + "611 $x": u"General subdivision", + "611 $2": u"Source of name", + "630": u"Subject - Uniform Title", + "630 $a": u"Uniform title", + "630 $x": u"General subdivision", + "630 $2": u"Source of name", + "650": u"Subject - Topical Term", + "650 $a": u"Topical term", + "650 $x": u"General subdivision", + "650 $2": u"Source of name", + "651": u"Subject - Geographic Name", + "651 $a": u"Geographic name", + "651 $x": u"General subdivision", + "651 $2": u"Source of name", + "700": u"Personal Name", + "700 $a": u"Personal name", + "700 $b": u"Numeration", + "700 $c": u"Words associated with a name", + "700 $d": u"Dates associated with a name", + "710": u"Corporate Name", + "710 $a": u"Corporate name entry element", + "710 $b": u"Subordinate unit", + "711": u"Meeting Name", + "711 $a": u"Meeting name entry element", + "711 $n": u"Number of meeting", + "711 $d": u"Year of meeting", + "711 $c": u"Place of meeting", + "856": u"Electronic Location and Access", + "923": u"Local Acquisitions Information", + "923 $d": u"Invoice date", + "923 $n": u"Invoice number", + "923 $s": u"Source code", +} + +def meaning(tag, subfield=""): + if subfield: + key = "%s $%s" % (tag, subfield) + else: + key = tag + return TAGS.get(key, u"") + \ No newline at end of file diff --git a/glamkit_collections/utils/marc/unicode_csv.py b/glamkit_collections/utils/marc/unicode_csv.py new file mode 100644 index 00000000..5eb62f02 --- /dev/null +++ b/glamkit_collections/utils/marc/unicode_csv.py @@ -0,0 +1,30 @@ +import csv, codecs, cStringIO + +class UnicodeWriter: + """ + A CSV writer which will write rows to CSV file "f", + which is encoded in the given encoding. + """ + + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): + # Redirect output to a queue + self.queue = cStringIO.StringIO() + self.writer = csv.writer(self.queue, dialect=dialect, **kwds) + self.stream = f + self.encoder = codecs.getincrementalencoder(encoding)() + + def writerow(self, row): + self.writer.writerow([s.encode("utf-8") for s in row]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) + + def writerows(self, rows): + for row in rows: + self.writerow(row) \ No newline at end of file diff --git a/glamkit_collections/utils.py b/glamkit_collections/utils/measurements.py similarity index 54% rename from glamkit_collections/utils.py rename to glamkit_collections/utils/measurements.py index 1f779fc1..8341115e 100644 --- a/glamkit_collections/utils.py +++ b/glamkit_collections/utils/measurements.py @@ -1,93 +1,8 @@ #!/usr/bin/python # -*- coding: UTF-8 -*- -import itertools import unittest -from django.utils.functional import allow_lazy -from django.utils.safestring import mark_safe -from django.utils.text import slugify -import re from pyparsing import Optional, nums, Word, ParseException, Group -import six -from unidecode import unidecode -def wikipedia_slugify(value, do_unidecode=False): - """ - Converts to ASCII via unidecode. - Converts spaces to underscore. - - Removes characters that - aren't alphanumerics, underscores, or hyphens. - - Preserve case. - - Also strips leading and trailing whitespace. 
diff --git a/glamkit_collections/utils.py b/glamkit_collections/utils/measurements.py
similarity index 54%
rename from glamkit_collections/utils.py
rename to glamkit_collections/utils/measurements.py
index 1f779fc1..8341115e 100644
--- a/glamkit_collections/utils.py
+++ b/glamkit_collections/utils/measurements.py
@@ -1,93 +1,8 @@
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
-import itertools
 import unittest
-from django.utils.functional import allow_lazy
-from django.utils.safestring import mark_safe
-from django.utils.text import slugify
-import re
 from pyparsing import Optional, nums, Word, ParseException, Group
-import six
-from unidecode import unidecode
-
-
-def wikipedia_slugify(value, do_unidecode=False):
-    """
-    Converts to ASCII via unidecode.
-    Converts spaces to underscore.
-
-    Removes characters that
-    aren't alphanumerics, underscores, or hyphens.
-
-    Preserve case.
-
-    Also strips leading and trailing whitespace.
-    """
-    if do_unidecode:
-        value = unidecode(value)
-    value = value.strip()
-    return mark_safe(re.sub('[\s/#\?:@]+', '_', value))
-wikipedia_slugify = allow_lazy(wikipedia_slugify, six.text_type)
-
-
-def alt_slugify(value):
-    if value and value.strip():
-        return re.sub('[-_]', '', slugify(unicode(unidecode(value))))
-    else:
-        return ""
-alt_slugify = allow_lazy(alt_slugify, six.text_type)
-
-
-def ensure_unique(qs, field_name, value, exclude_id=None):
-    """
-    Makes sure that `value` is unique on model.fieldname. And nonempty.
-    """
-    orig = value
-    if not value:
-        value = "None"
-    for x in itertools.count(1):
-        if not qs.exclude(id=exclude_id).filter(**{field_name: value}).exists():
-            break
-        if orig:
-            value = '%s-%d' % (orig, x)
-        else:
-            value = '%d' % x
-
-    return value
-
-
-def strip_parens(s):
-    result = re.sub(r'^\(', '', s)
-    result = re.sub(r'\)$', '', result)
-    return result
-
-
-def ndashify(s):
-    """replace ' - ' with an n-dash character"""
-    return re.sub(r' - ', u'–', unicode(s))
-
-
-def fix_line_breaks(s):
-    """
-    Convert \r\n and \r to \n chars. Strip any leading or trailing whitespace
-    on each line. Remove blank lines.
-    """
-    l = s.splitlines()
-    x = [i.strip() for i in l]
-    x = [i for i in x if i]  # remove blank lines
-    return "\n".join(x)
-
-
-def strip_line_breaks(s):
-    """
-    Remove \r and \n chars, replacing with a space. Strip leading/trailing
-    whitespace on each line. Remove blank lines.
-    """
-    return re.sub(r'[\r\n ]+', ' ', s).strip()
-
-
-def remove_url_breaking_chars(s):
-    r = re.sub(r'[\?#&/]', '', s)
-    return r.strip()
 
 # oh yay we need parsers for imperial units. Thanks Obama.
 INTEGER = Word(nums).setParseAction(lambda t: [int(t[0])])
diff --git a/glamkit_collections/utils/slugs.py b/glamkit_collections/utils/slugs.py
new file mode 100644
index 00000000..4f940a4a
--- /dev/null
+++ b/glamkit_collections/utils/slugs.py
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+import re
+
+from django.utils.functional import allow_lazy
+from django.utils.safestring import mark_safe
+from django.utils.text import slugify
+import six
+from unidecode import unidecode
+
+
+def wikipedia_slugify(value, do_unidecode=False):
+    """
+    Optionally converts to ASCII via unidecode.
+    Converts spaces and the URL-breaking characters /, #, ?, : and @ to
+    underscores, preserving case.
+    Also strips leading and trailing whitespace.
+    """
+    if do_unidecode:
+        value = unidecode(value)
+    value = value.strip()
+    return mark_safe(re.sub(r'[\s/#\?:@]+', '_', value))
+wikipedia_slugify = allow_lazy(wikipedia_slugify, six.text_type)
+
+
+def alt_slugify(value):
+    """
+    A more extreme version of slugify, unidecoding and removing hyphens.
+
+    Useful for fallback slug values.
+    """
+    if value and value.strip():
+        return re.sub('[-_]', '', slugify(unicode(unidecode(value))))
+    else:
+        return ""
+alt_slugify = allow_lazy(alt_slugify, six.text_type)
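+
+# Examples (illustrative):
+#
+#     wikipedia_slugify(u"Vincent van Gogh")  ->  u"Vincent_van_Gogh"
+#     alt_slugify(u"Self-Portrait")           ->  u"selfportrait"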
diff --git a/glamkit_collections/utils/xml/__init__.py b/glamkit_collections/utils/xml/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/glamkit_collections/utils/xml/handler.py b/glamkit_collections/utils/xml/handler.py
new file mode 100644
index 00000000..a30527c2
--- /dev/null
+++ b/glamkit_collections/utils/xml/handler.py
@@ -0,0 +1,79 @@
+from lxml.cssselect import CSSSelector
+
+from lib.constants import BREAK, DO_NOT_DISCARD
+from lib.iterxml import multifile_iter_elems
+
+
+def node(selector, kallable, **kwargs):
+    return {
+        'selector': selector,
+        'callable': kallable,
+        'kwargs': kwargs,
+    }
+
+
+class BaseHandler(object):
+    """
+    In subclasses, define ``handle_nodes``, a mapping of CSS-style selectors
+    to callables:
+
+        handle_nodes = (
+            node('topElement > Organisation', org_callable, **kwargs),
+            node('topElement > Person[id="person-345"]', person_345_callable, **kwargs),
+            node('topElement > Work', work_callable, **kwargs),
+            ...
+        )
+
+    Only the value of the *final* matching callable is returned to the
+    parent, so callables should normally do their own saving.
+
+    Define pre_ and post_harvest handlers if you need them. The post_harvest
+    handler in particular should operate on as few records as possible, in
+    order to make incremental changes feasible. (pre_ and post_harvest
+    handlers quite often call similarly-named handlers on the callables.)
+
+    Then call .process(files). Every time a selector is matched (in
+    top-to-bottom order), the elem, and kwargs, will be passed to the
+    corresponding callable.
+
+    Each callable should:
+
+    1) Handle the contents of the XML elem.
+    2) Log for itself that the handling has happened (e.g. for
+       post-batch-handle cleaning, etc.).
+    3) Return constants.DO_NOT_DISCARD if the memory taken by this elem is
+       NOT to be freed up after the callable is called - i.e. another
+       handler should run on the same data.
+    """
+
+    def _make_handler_list(self):
+        self.HANDLERS = []
+        namespaces = getattr(self, 'namespaces', {})
+        for n in self.handle_nodes:
+            # CSSSelector() compiles the CSS selector into XPath.
+            self.HANDLERS.append(
+                (
+                    CSSSelector(n['selector'], namespaces=namespaces),
+                    n['callable'],
+                    n['kwargs'],
+                )
+            )
+
+    def __init__(self, *args, **kwargs):
+        self._make_handler_list()
+
+    def handle_elem(self, elem):
+        root = elem.getroottree().getroot()
+        # Do the cleaning.
+        x = DO_NOT_DISCARD
+        for selector, kallable, kwargs in self.HANDLERS:
+            # Call the handler if the elem matches this selector (as loaded
+            # into the root so far).
+            if elem in selector(root):
+                x = kallable(elem, **kwargs)
+                if x is not None and x & BREAK:
+                    break
+        return x
+
+    def process(self, files, post_only=False, encoding=None):
+        if not post_only:
+            self.pre_harvest()
+        # Step through the given files, calling handle_elem at the end of
+        # each element.
+        multifile_iter_elems(files, callable_start=None,
+                             callable_end=self.handle_elem, encoding=encoding)
+        self.post_harvest()
+
+    def pre_harvest(self):
+        pass
+
+    def post_harvest(self):
+        pass
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/lib/__init__.py b/glamkit_collections/utils/xml/lib/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/glamkit_collections/utils/xml/lib/analyze.py b/glamkit_collections/utils/xml/lib/analyze.py
new file mode 100644
index 00000000..a374b1b0
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/analyze.py
@@ -0,0 +1,89 @@
+import csv
+import sys
+
+from iterxml import multifile_iter_elems
+from utils import remove_ns, get_path
+
+csv_header = ("path", "min_cardinality", "max_cardinality", "samples", "attributes")
+
+
+def _get_children_from_analysis(path, analysis):
+    for k in analysis.keys():
+        if k.startswith(path) and k != path:  # k is a descendant, but not necessarily a child.
+            if len(k[len(path):].split('.')) == 2:  # it's a child: ['', 'child']
+                yield k
+
+
+def analyze_start(elem, analysis, sample_length):
+    path = get_path(elem)
+
+    if path not in analysis:
+        analysis[path] = {
+            'cardinality_current': 0,
+            'cardinality_min': sys.maxint,
+            'cardinality_max': 0,
+            'values': set(),
+            'attributes': {},
+        }
+
+    analysis[path]['cardinality_current'] += 1
+    # maintain max
+    if analysis[path]['cardinality_current'] > analysis[path]['cardinality_max']:
+        analysis[path]['cardinality_max'] = analysis[path]['cardinality_current']
+
+    # attributes
+    for attr in elem.keys():
+        av = analysis[path]['attributes'].get(attr, set())
+        if len(av) < sample_length:
+            av.add(elem.get(attr))
+        analysis[path]['attributes'][attr] = av
+
+
+def analyze_end(elem, analysis, sample_length):
+    path = get_path(elem)
+
+    # maintain min
+    for c in _get_children_from_analysis(path, analysis):
+        if analysis[c]['cardinality_current'] < analysis[c]['cardinality_min']:
+            analysis[c]['cardinality_min'] = analysis[c]['cardinality_current']
+        analysis[c]['cardinality_current'] = 0
+    # sample values
+    if len(analysis[path]['values']) < sample_length:
+        try:
+            v = elem.text.strip()
+            if v:
+                analysis[path]['values'].add(v)
+        except AttributeError:  # elem.text is None
+            pass
+
+
+def _attributestring(attrdict):
+    ss = []
+    for key, value in attrdict.iteritems():
+        s = "%s = (\"%s\")" % (remove_ns(key), "\", \"".join(value))
+        ss.append(s)
+
+    return "\r\n\r\n".join(ss)
+
+
+def xmlanalyze(files, sample_length=5):
+    """
+    Writes a csv of xml paths and analyzed values to stdout, showing, for
+    example, how many elements exist for every path in an xml file.
+    """
+
+    analysis = {}
+
+    multifile_iter_elems(files, analyze_start, analyze_end,
+                         sample_length=sample_length, analysis=analysis)
+
+    writer = csv.writer(sys.stdout)
+    writer.writerow(csv_header)
+
+    listanalysis = sorted(analysis.iteritems())
+
+    for key, value in listanalysis:
+        v = []
+        v.append(key)  # path
+        if value['cardinality_min'] == sys.maxint:  # top-level nodes never get a min; use the max.
+            value['cardinality_min'] = value['cardinality_max']
+        v.append(value['cardinality_min'])
+        v.append(value['cardinality_max'])
+        v.append("\r\r".join(value['values']))
+        v.append(_attributestring(value['attributes']))
+
+        writer.writerow(v)
diff --git a/glamkit_collections/utils/xml/lib/constants.py b/glamkit_collections/utils/xml/lib/constants.py
new file mode 100644
index 00000000..811c2b7c
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/constants.py
@@ -0,0 +1,3 @@
+DISCARD_AFTER = 1  # None will work too
+DO_NOT_DISCARD = 2
+BREAK = 4
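+
+# These are bitflags: the iterators in iterxml.py (and handler.py) test a
+# handler's return value with `&`, so flags can be combined with `|` - e.g.
+# DO_NOT_DISCARD | BREAK keeps the element in memory and stops iteration.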
diff --git a/glamkit_collections/utils/xml/lib/files.py b/glamkit_collections/utils/xml/lib/files.py
new file mode 100644
index 00000000..73ebf652
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/files.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+
+# A file-like object that cleans up each line, if necessary.
+# This is useful if your xml file declares utf-8 and yet there are iso values in the actual file.
+# http://stackoverflow.com/questions/2352840/parsing-broken-xml-with-lxml-etree-iterparse
+
+class File(object):
+    def __init__(self, filename):
+        self.f = open(filename, 'rt')
+
+    def read(self, size=None):
+        # Despite the name, this returns one *line* per call, which is enough
+        # for lxml's parser; add .replace() calls here to strip bad
+        # characters, e.g. .replace('\x1e', '').
+        try:
+            return self.f.next()
+        except StopIteration:  # file-like objects must return '' at EOF
+            return ''
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/lib/iterxml.py b/glamkit_collections/utils/xml/lib/iterxml.py
new file mode 100644
index 00000000..f0789c65
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/iterxml.py
@@ -0,0 +1,63 @@
+"""
+A pattern for iterating through a list of XML files, calling a callable at
+the start and end of each elem, and freeing up the memory used by that elem
+afterwards.
+
+If the callable returns constants.DO_NOT_DISCARD, the memory is NOT freed up.
+
+Usage:
+
+    iter_elems(path_to_xml, callable_start, callable_end, *args, **kwargs)
+
+or, for several XML files:
+
+    multifile_iter_elems(paths_to_xmls, callable_start, callable_end, *args, **kwargs)
+
+(extra *args and **kwargs are passed through to the callables).
+"""
+import sys
+
+from lxml import etree
+
+from constants import BREAK, DISCARD_AFTER
+
+SKIP_UNTIL = 0
+
+
+def _fast_iter(context, callable_start, callable_end, *args, **kwargs):
+    _iter_count = kwargs.pop('_iter_count', 0)
+    for event, elem in context:
+        if _iter_count >= SKIP_UNTIL:
+            if event == "start":
+                status = callable_start(elem, *args, **kwargs)
+            elif event == "end":
+                status = callable_end(elem, *args, **kwargs)
+                if status is not None and status & BREAK:
+                    break
+            if event == "end":
+                if status is None or status & DISCARD_AFTER:
+                    elem.clear()
+                    while elem.getprevious() is not None:  # delete now-processed preceding siblings
+                        del elem.getparent()[0]
+        _iter_count += 1
+        if _iter_count % 10000 == 0:
+            sys.stderr.write("processing %s elements...\n" % _iter_count)
+        # Work around bug #1185701 by bailing out after the end of the
+        # document root.
+        if elem.getparent() is None:
+            break
+    del context
+    return _iter_count
+
+
+def iter_elems(xml_file, callable_start, callable_end, encoding=None, *args, **kwargs):
+    kwargs['_iter_count'] = kwargs.get('_iter_count', 0)
+    events = []
+    if callable_start is not None:
+        events.append('start')
+    if callable_end is not None:
+        events.append('end')
+    context = etree.iterparse(xml_file, events=events, encoding=encoding)
+    return _fast_iter(context, callable_start, callable_end, *args, **kwargs)
+
+
+def multifile_iter_elems(xml_files, callable_start, callable_end, encoding=None, *args, **kwargs):
+    _iter_count = 0
+
+    for f in xml_files:
+        sys.stderr.write("===\nprocessing file %s\n===\n" % f)
+        kwargs['_iter_count'] = _iter_count
+        _iter_count = iter_elems(f, callable_start, callable_end, encoding, *args, **kwargs)
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/lib/utils.py b/glamkit_collections/utils/xml/lib/utils.py
new file mode 100644
index 00000000..8cf31493
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/utils.py
@@ -0,0 +1,16 @@
+import re
+
+NS_RE = re.compile(r"\{.*?\}")
+
+
+def remove_ns(tag):
+    """Strip any {namespace} prefix from a tag name."""
+    return re.sub(NS_RE, "", tag)
+
+
+def get_path(elem, separator="."):
+    """Return the dotted path of an element, e.g. ``looks.like.this``."""
+    anlist = [remove_ns(e.tag) for e in elem.iterancestors()]
+    anlist.reverse()
+    anlist += [remove_ns(elem.tag)]
+    return separator.join(anlist)
+
+
+def camelcase_to_underscore(s):
+    return re.sub('(((?<=[a-z])[A-Z])|([A-Z](?![A-Z]|$)))', '_\\1', s).lower().strip('_')
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/lib/xml2dict.py b/glamkit_collections/utils/xml/lib/xml2dict.py
new file mode 100644
index 00000000..1b5d2321
--- /dev/null
+++ b/glamkit_collections/utils/xml/lib/xml2dict.py
@@ -0,0 +1,27 @@
+from utils import remove_ns, camelcase_to_underscore
+
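+# Converts an lxml element into a plain dict. The output shape is roughly
+# (an illustrative sketch):
+#
+#     <a b="c"><d>text</d></a>  ->  {'_attributes': {'b': 'c'},
+#                                    'd': [{'_value': 'text'}]}
+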
+def xml2dict(tag):
+    r = {}
+
+    # value (tag.text is None for empty tags)
+    if tag.text is not None:
+        v = tag.text.strip()
+        r['_value'] = v
+
+    # attributes
+    if tag.keys():
+        r['_attributes'] = {}
+        for k in tag.keys():
+            _k = remove_ns(k)
+            _k = camelcase_to_underscore(_k)
+            r['_attributes'][_k] = tag.get(k)
+
+    # children: assume every child is potentially a list
+    for child in tag.getchildren():
+        ctag = remove_ns(child.tag)
+        ctag = camelcase_to_underscore(ctag)
+        l = r.get(ctag, [])
+        l.append(xml2dict(child))
+        r[ctag] = l
+    return r
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/processors/__init__.py b/glamkit_collections/utils/xml/processors/__init__.py
new file mode 100644
index 00000000..f0e73fa6
--- /dev/null
+++ b/glamkit_collections/utils/xml/processors/__init__.py
@@ -0,0 +1,2 @@
+from base import *
+from mongo import *
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/processors/base.py b/glamkit_collections/utils/xml/processors/base.py
new file mode 100644
index 00000000..d6b67330
--- /dev/null
+++ b/glamkit_collections/utils/xml/processors/base.py
@@ -0,0 +1,14 @@
+import sys
+
+__all__ = ['BaseProcessor', 'DebugProcessor']
+
+
+class BaseProcessor(object):
+    def __call__(self, tag):
+        raise NotImplementedError(
+            "Subclasses of BaseProcessor need to implement __call__().")
+
+
+class DebugProcessor(BaseProcessor):
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def __call__(self, tag):
+        sys.stderr.write("Fake saving %s\n" % (tag,))
diff --git a/glamkit_collections/utils/xml/processors/django.py b/glamkit_collections/utils/xml/processors/django.py
new file mode 100644
index 00000000..8d1db1f0
--- /dev/null
+++ b/glamkit_collections/utils/xml/processors/django.py
@@ -0,0 +1,57 @@
+from base import BaseProcessor
+from ..lib.xml2dict import xml2dict
+
+__all__ = ['DjangoSaver']
+
+
+DEBUG_ON_IMPORT_SAVE_ERROR = True
+
+
+class DjangoSaver(BaseProcessor):
+    """
+    Convert XML to a dict, clean the dict, then put it into a Django model
+    instance and save it.
+
+    You'll want to override 'clean', and return the kwargs for the model
+    instance creation (or None to skip the record).
+
+    We force the use of 'id' for the PK.
+    """
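+    # An illustrative subclass (hypothetical model and fields, matching the
+    # xml2dict output shape):
+    #
+    #     class WorkSaver(DjangoSaver):
+    #         def clean(self, attribs):
+    #             try:
+    #                 return {
+    #                     'id': attribs['_attributes']['id'],
+    #                     'title': attribs['title'][0]['_value'],
+    #                 }
+    #             except (KeyError, IndexError):
+    #                 return None  # counted as a skipped record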
+    def __init__(self, model):
+        self.model = model
+        self.count = 0
+        self.fails = 0
+
+    def clean(self, attribs):
+        return attribs
+
+    def __call__(self, tag):
+        u = xml2dict(tag)
+        d = self.clean(u)
+
+        try:
+            if d is not None:
+                try:  # update (deleting from a RDBMS updates FKs)
+                    m = self.model.objects.get(pk=d['id'])
+                    del d['id']
+                    for k, v in d.items():
+                        setattr(m, k, v)
+                    m.save()
+                except self.model.DoesNotExist:
+                    m = self.model(**d)
+                    m.save()
+                self.count += 1
+                if self.count % 100 == 0:
+                    print "saved %s items" % self.count
+            else:  # d is None (fail)
+                self.fails += 1
+                if self.fails % 10 == 0:
+                    print "SKIPPED %s items" % self.fails
+
+        except Exception as e:
+            if DEBUG_ON_IMPORT_SAVE_ERROR:
+                from pprint import pprint
+                pprint(e)
+                pprint(u)
+                pprint(d)
+                import pdb; pdb.set_trace()
+            else:
+                raise e
\ No newline at end of file
diff --git a/glamkit_collections/utils/xml/processors/mongo.py b/glamkit_collections/utils/xml/processors/mongo.py
new file mode 100644
index 00000000..4e08e264
--- /dev/null
+++ b/glamkit_collections/utils/xml/processors/mongo.py
@@ -0,0 +1,56 @@
+from base import BaseProcessor
+from ..lib.xml2dict import xml2dict
+
+__all__ = ['MongoSaver']
+
+
+DEBUG_ON_IMPORT_SAVE_ERROR = True
+
+
+class MongoSaver(BaseProcessor):
+    """
+    Convert XML to a dict, clean the dict, then put it into a mongo document
+    instance and save it.
+
+    You'll want to override 'clean', and return the kwargs for the document
+    instance creation (or None to skip the record).
+
+    We force the use of 'id' for the PK.
+    """
+    def __init__(self, model):
+        self.model = model
+        self.count = 0
+        self.fails = 0
+
+    def clean(self, attribs):
+        return attribs
+
+    def __call__(self, tag):
+        u = xml2dict(tag)
+        d = self.clean(u)
+
+        try:
+            if d is not None:
+                # try:
+                #     self.model.objects.get(id=d['id'])
+                # except self.model.DoesNotExist:
+                #     pass
+                m = self.model(**d)
+                m.save()
+                self.count += 1
+                if self.count % 100 == 0:
+                    print "saved %s items" % self.count
+            else:  # d is None (fail)
+                self.fails += 1
+                if self.fails % 10 == 0:
+                    print "SKIPPED %s items" % self.fails
+
+        except Exception as e:
+            if DEBUG_ON_IMPORT_SAVE_ERROR:
+                from pprint import pprint
+                print "Exception:"
+                pprint(e)
+                print "xml2dict output:"
+                pprint(u)
+                print "Cleaned dictionary:"
+                pprint(d)
+                import pdb; pdb.set_trace()
+            else:
+                raise e
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d83a4012..8d7efbdd 100644
--- a/setup.py
+++ b/setup.py
@@ -169,7 +169,13 @@ def find_packages(*paths):
             'colormath==2.1.1',
             # Disable as it's devpi-only
             # 'colorweave==0.1+0.ce27c83b4e06a8185531538fa11c18c5ea2c1aba.ixc',
-        ]
+        ],
+        'import_marc': [
+            'pymarc',
+        ],
+        'import_xml': [
+            'lxml',
+        ],
     },
     setup_requires=['setuptools_scm'],
 )