# convert_cc_links.py (forked from lalinsky/musicbrainz-bot)
# -*- coding: utf-8 -*-
import re
from optparse import OptionParser
from collections import defaultdict
import sqlalchemy
import mechanize
import editing
from editing import MusicBrainzClient
from utils import out
from mbbot.utils.pidfile import PIDFile
import config as cfg
# Schema of the bookkeeping table used by this script (it is read when
# building ``cc_removed`` and written to from ``main``). The bare string
# below is documentation only; nothing here executes it.
'''
CREATE TABLE bot_cc_removed (
gid uuid NOT NULL,
url text NOT NULL,
processed timestamp with time zone DEFAULT now(),
CONSTRAINT bot_cc_removed_pkey PRIMARY KEY (gid,url)
);
'''
# Module-level database connection, opened at import time from cfg.MB_DB.
engine = sqlalchemy.create_engine(cfg.MB_DB)
db = engine.connect()
# Resolve unqualified table names against the "musicbrainz" schema.
db.execute('SET search_path TO musicbrainz')
# Client used to submit edits, built from the credentials and site in config.
mb = MusicBrainzClient(cfg.MB_USERNAME, cfg.MB_PASSWORD, cfg.MB_SITE)
# Selects one row per (release, URL relationship) where the relationship has
# link type 84 and no pending edits, ordered by artist credit. Columns are
# release id, release gid, artist credit, release name, URL and the
# l_release_url row id.
# NOTE(review): link type 84 is presumably the "Creative Commons licensed
# download" relationship that main() converts -- confirm against the schema.
query_releases_with_cc = '''
SELECT r.id, r.gid, r.artist_credit, release_name.name, url.url, l_ru.id
FROM release r
JOIN release_name ON r.name = release_name.id
JOIN l_release_url l_ru ON r.id = l_ru.entity0
JOIN link l ON l_ru.link = l.id
JOIN url ON url.id = l_ru.entity1
WHERE l.link_type = 84 AND l_ru.edits_pending = 0
GROUP BY r.id, r.gid, r.artist_credit, release_name.name, url.url, l_ru.id
ORDER BY r.artist_credit
'''
# Shared mechanize browser used by main() to fetch the linked pages.
browser = mechanize.Browser()
# Do not let robots.txt handling block page fetches.
browser.set_handle_robots(False)
# Keep redirect and HTTP debug output switched off.
browser.set_debug_redirects(False)
browser.set_debug_http(False)
# Maps each HTML-special character to its entity. The entity text must stay
# spelled out (e.g. "&amp;"): if it is decoded back to the raw character the
# table becomes a no-op and the quote entries are no longer valid Python.
html_escape_table = {
    u'&': u'&amp;',
    u'"': u'&quot;',
    u"'": u'&apos;',
    u'>': u'&gt;',
    u'<': u'&lt;',
}


def html_escape(text):
    """Return ``text`` with &, ", ', > and < replaced by HTML entities.

    Characters that are not keys of ``html_escape_table`` are copied through
    unchanged. The result is always a unicode string; an empty input yields
    an empty string.
    """
    return u''.join(html_escape_table.get(c, c) for c in text)
# (gid, url) pairs already recorded in bot_cc_removed, loaded once at import
# time so main() can test membership without querying the database again.
cc_removed = {
    (release_gid, link_url)
    for release_gid, link_url in db.execute('''SELECT gid, url FROM bot_cc_removed''')
}
# Patterns used for every release; compiled once instead of on each iteration.
_CC_LICENSE_URL_RE = re.compile(r'(http://creativecommons.org/licenses/[0-9A-Za-z/+.-]+)')
_MAGNATUNE_URL_RE = re.compile(r'http://([^/]+\.)?magnatune\.com/')
_BANDCAMP_URL_RE = re.compile(r'http://([^/]+\.)?bandcamp\.com/')
_BANDCAMP_ALBUM_URL_RE = re.compile(r'http://([^/]+\.)?bandcamp\.com/album/')
_IMPATIENT_LINK_RE = re.compile(r'<p class="impatient"><a href="([^"]+)">Impatient\?</a></p>')
_LICENSE_SUFFIX_RE = re.compile(r'((legalcode|deed)((\.|-)[a-z]+)?)$')
_NAME_NOISE_RE = re.compile(r'( +e\.p\.| +ep|, volume [0-9]+)')


def _open_url(url):
    """Load ``url`` into the shared browser; return False if that failed."""
    try:
        # The encode stays inside the try: a URL that cannot be encoded is
        # treated like any other failed fetch.
        browser.open(url.encode('utf-8'))
    except Exception:
        # Best effort: a page that cannot be fetched just skips the release.
        # Unlike a bare ``except:`` this lets KeyboardInterrupt and
        # SystemExit stop the bot.
        return False
    return True


def _read_page():
    """Read the current browser response; return (page, set of CC license URLs)."""
    page = browser.response().read()
    return page, set(_CC_LICENSE_URL_RE.findall(page))


def main(verbose=False):
    """Convert link-type-84 release/URL relationships to download + license links.

    For every row returned by ``query_releases_with_cc`` the linked page is
    fetched and searched for Creative Commons license URLs. magnatune.com
    URLs are not fetched; they are assigned the by-nc-sa 1.0 license
    directly. A release is skipped when the page cannot be fetched, is not a
    text document, contains zero or several license URLs, does not mention
    the release name, or (for bandcamp) the kind of download cannot be
    determined.

    For the remaining releases the license URL is added to the release and
    the existing relationship is edited from type 84 to ``link_id``. If that
    edit is not accepted by the client (falsy return) and the (gid, url)
    pair has not been handled before, the relationship is removed and the
    pair is recorded in ``bot_cc_removed``.

    Args:
        verbose: when true, report progress and skip reasons through ``out``.
    """
    releases = [tuple(row) for row in db.execute(query_releases_with_cc)]
    count = len(releases)
    for i, (_release_id, gid, _artist_credit, name, url, rel_id) in enumerate(releases):
        original_url = url
        is_magnatune = _MAGNATUNE_URL_RE.match(original_url) is not None
        if verbose:
            out(u'%d/%d - %.2f%%' % (i, count, i * 100.0 / count))
            out(u'%s - http://musicbrainz.org/release/%s - %s' % (name, gid, url))

        if is_magnatune:
            license_urls = set([u'http://creativecommons.org/licenses/by-nc-sa/1.0/'])
        else:
            if not _open_url(url):
                continue
            # A response without a Content-type header is treated as
            # "not text" instead of raising KeyError and killing the run.
            content_type = browser.response().info().get('Content-type') or ''
            if not content_type.startswith('text'):
                if verbose:
                    out(u'not a text document, aborting!')
                continue
            page, license_urls = _read_page()

            if not license_urls:
                # Fall back to the archived copy, but only for jamendo pages.
                url = u'http://web.archive.org/' + url
                if verbose:
                    out(u'no license url found, trying archive.org!')
                if u'jamendo.com' not in url:
                    continue
                if not _open_url(url):
                    continue
                page, license_urls = _read_page()
                if not license_urls and '<p class="impatient"><a href="http://web.archive.org' in page:
                    # The archive served an interstitial page; follow its
                    # "Impatient?" link to the actual snapshot.
                    if verbose:
                        out(u'no license url found, trying archive.org AGAIN!')
                    m = _IMPATIENT_LINK_RE.search(page)
                    if m and m.group(1):
                        url = m.group(1)
                        if not _open_url(url):
                            continue
                        page, license_urls = _read_page()

            if len(license_urls) > 1:
                if verbose:
                    out(u'more than one license url found, aborting!')
                continue
            if not license_urls:
                if verbose:
                    out(u'no license url found, aborting!')
                continue

            # The page must mention the release name: verbatim, HTML-escaped,
            # or with an "EP" / ", volume N" suffix stripped.
            page_lower = page.lower()
            name_lower = name.lower()
            name_variants = (
                name_lower,
                html_escape(name_lower),
                _NAME_NOISE_RE.sub(u'', name_lower),
            )
            if not any(v.encode('utf-8') in page_lower for v in name_variants):
                if verbose:
                    out(u'album name not found in page, aborting!')
                continue

        license_url_raw = list(license_urls)[0]
        # Drop a trailing "legalcode" / "deed" (optionally with a language
        # suffix) so the bare license URL is what gets linked.
        license_url = _LICENSE_SUFFIX_RE.sub(u'', license_url_raw)
        if verbose:
            out(u'%s' % license_url)

        # NOTE(review): 75 and 74 are presumably the free and paid download
        # link types, and 301 below the license link type -- confirm.
        link_id = 75
        text = u'The [Creative_Commons_Licensed_Download_Relationship_Type] is obsoleted by a new [License_Relationship_Type].'
        text += u' All CC links will be replaced by a Free/Paid Download Relationship and a License URL.\n'
        if is_magnatune:
            text += u'“All music files available to Magnatune members are licensed under a Creative Commons by-nc-sa v1.0 license.”\n'
            text += u'http://magnatune.com/info/cc_licensed'
            link_id = 74
        else:
            if _BANDCAMP_URL_RE.match(original_url):
                is_album_url = _BANDCAMP_ALBUM_URL_RE.match(original_url)
                if not is_album_url and ('>%s</h2>' % name.lower().encode('utf-8')) not in page.lower():
                    if verbose:
                        out(u'not the bandcamp page for this album, aborting!')
                    continue
                if '>Free Download</a>' not in page:
                    if '>Buy Now</a>' in page:
                        link_id = 74
                    else:
                        if verbose:
                            out(u'could not determine kind of download (free/paid), aborting!')
                        continue
            text += u'I’m converting this relationship because I’ve found a link to %s in the linked page %s.' % (license_url_raw, url)

        mb.add_url('release', gid, 301, license_url, text, auto=False)
        edited = mb.edit_relationship(rel_id, 'release', 'url', 84, link_id, {'license.0': []}, {}, {}, text, auto=False)
        if not edited and (gid, original_url) not in cc_removed:
            text = u'Download and License relationship are already set, so this relationship is not necessary anymore.'
            mb.remove_relationship(rel_id, 'release', 'url', text)
            db.execute("INSERT INTO bot_cc_removed (gid,url) VALUES (%s,%s)", (gid, original_url))
            cc_removed.add((gid, original_url))
if __name__ == '__main__':
    # Command-line entry point: parse the -v/--verbose flag, then run main()
    # inside a PIDFile context for the path below.
    option_parser = OptionParser()
    option_parser.add_option('-v', '--verbose', action='store_true', default=False, help='be more verbose')
    opts, _positional = option_parser.parse_args()
    with PIDFile('/tmp/mbbot_convert_cc_links.pid'):
        main(opts.verbose)