#!/usr/bin/env python
# -*- coding: utf-8 -*-
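
# oa-put – Open Access Importer upload operations: uploads converted
# supplementary materials to a MediaWiki instance.
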
from os import path
from sys import argv, stderr
from time import sleep
from urllib2 import urlparse
import csv
# csv.field_size_limit must be reset according to
# <http://lethain.com/handling-very-large-csv-and-xml-files-in-python/>
csv.field_size_limit(999999999)
from helpers import config, efetch, filename_from_url, mediawiki, template
from model import session, setup_all, create_all, set_source, \
    Article, Journal, SupplementaryMaterial
try:
    action = argv[1]
    target = argv[2]
except IndexError:  # no arguments given
    stderr.write("""
oa-put – Open Access Importer upload operations
usage: oa-put upload-media [source]
""")
    exit(1)

try:
    assert(action in ['upload-media'])
except AssertionError:  # invalid action
    stderr.write("Unknown action “%s”.\n" % action)
    exit(2)

try:
    exec "from sources import %s as source_module" % target
except ImportError:  # invalid source
    stderr.write("Unknown source “%s”.\n" % target)
    exit(3)
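
# Point the model layer at the chosen source and set up the database.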
set_source(target)
setup_all(True)

if action == 'upload-media':
    media_refined_directory = config.get_media_refined_source_path(target)

    materials = SupplementaryMaterial.query.filter_by(
        converted=True,
        uploaded=False
    ).all()

    for material in materials:
        filename = filename_from_url(material.url) + '.ogg'
        media_refined_path = path.join(media_refined_directory, filename)
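
        # An empty output file means the conversion yielded nothing usable;
        # mark the material as not converted and move on.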
        if path.getsize(media_refined_path) == 0:
            material.converted = False
            continue
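
        # Skip anything that is already present on the target wiki and mark
        # it as uploaded.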
        if mediawiki.is_uploaded(material):
            stderr.write("Skipping “%s”, already exists at %s.\n" % (
                media_refined_path.encode('utf-8'),
                mediawiki.get_wiki_name()
            ))
            material.uploaded = True
            continue
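
        # Collect article and material metadata for the file description page.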
        article_doi = material.article.doi
        article_pmid = efetch.get_pmid_from_doi(article_doi)
        article_pmcid = efetch.get_pmcid_from_doi(article_doi)
        authors = material.article.contrib_authors
        article_title = material.article.title
        journal_title = material.article.journal.title
        article_year = material.article.year
        article_month = material.article.month
        article_day = material.article.day
        article_url = material.article.url
        license_url = material.article.license_url
        rights_holder = material.article.copyright_holder
        label = material.label
        title = material.title
        caption = material.caption
        mimetype = material.mimetype
        material_url = material.url
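
        # Categories come from the article itself, plus any fetched via its
        # PubMed ID when one is available.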
        categories = [category.name for category in material.article.categories]
        if article_pmid is not None:
            categories += efetch.get_categories_from_pmid(article_pmid)
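
        # Name the wiki file after the source file, with an Ogg extension
        # that matches the media type.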
        # TODO: file extension should be adapted for other file formats
        url_path = urlparse.urlsplit(material.url).path
        source_filename = url_path.split('/')[-1]
        assert(mimetype in ('audio', 'video'))
        if mimetype == 'audio':
            extension = 'oga'
        elif mimetype == 'video':
            extension = 'ogv'
        wiki_filename = path.splitext(source_filename)[0] + '.' + extension
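
        # Prefix the file name with a cleaned-up form of the article title,
        # stripping characters that are not allowed in wiki file names.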
        if article_title is not None:
            dirty_prefix = article_title
            dirty_prefix = dirty_prefix.replace('\n', '')
            dirty_prefix = ' '.join(dirty_prefix.split())  # remove multiple spaces
            forbidden_chars = u"""?,;:^/!<>"`'±#[]|{}ʻʾʿ᾿῾‘’“”"""
            for character in forbidden_chars:
                dirty_prefix = dirty_prefix.replace(character, '')
            # prefix is the first hundred characters of the title, sans forbidden characters
            prefix = '-'.join(dirty_prefix[:100].split(' '))
            # if the title was truncated at 100 characters, the last word may
            # have been cut off, so drop it
            if len(dirty_prefix) > len(prefix):
                prefix = '-'.join(prefix.split('-')[:-1])
            if prefix[-1] != '-':
                prefix += '-'
            wiki_filename = prefix + wiki_filename
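
        # Render the file description page, upload the media file, and mark
        # the material as uploaded.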
        page_template = template.page(article_doi, article_pmid, \
            article_pmcid, authors, article_title, journal_title, \
            article_year, article_month, article_day, article_url, \
            license_url, label, caption, title, categories, mimetype, \
            material_url)

        mediawiki.upload(media_refined_path, wiki_filename, page_template)
        stderr.write("“%s” uploaded to <%s>.\n" % (
            media_refined_path.encode('utf-8'),
            config.api_url.encode('utf-8')
        ))

        material.uploaded = True
        session.commit()
        sleep(10)  # 6 uploads per minute