Skip to content

Commit

Permalink
Merge pull request #91 from mpacer/pretty_articles
Browse files Browse the repository at this point in the history
Pretty articles
  • Loading branch information
eseiver authored Mar 28, 2018
2 parents a2f2b75 + 1c1349f commit d0fff94
Show file tree
Hide file tree
Showing 7 changed files with 91 additions and 35 deletions.
2 changes: 1 addition & 1 deletion allofplos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ def get_corpus_dir():
# import after creating global variables that they may rely upon
# (e.g., corpusdir)

from .article_class import Article
from .article import Article
from .corpus import Corpus
89 changes: 59 additions & 30 deletions allofplos/article_class.py → allofplos/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@

from . import get_corpus_dir
from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX,
URL_SUFFIX, plos_page_dict, doi_url)
URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path)
from .plos_regex import validate_doi
from .elements import (parse_article_date, get_contrib_info,
Journal, License, match_contribs_to_dicts)
from .utils import dedent


class Article():
class Article:
"""The primary object of a PLOS article, initialized by a valid PLOS DOI.
"""
Expand Down Expand Up @@ -46,6 +47,62 @@ def __eq__(self, other):
dir_eq = self.directory == other.directory
return doi_eq and dir_eq

def __str__(self, exclude_refs=True):
    """Pretty-printed XML of the local article file, shown when printing the object.

    Intended for reading and inspecting the XML of a local article; should not
    be used for hashing. By default the <back> element (which includes the
    references list) is dropped to keep the printed output short.

    :param exclude_refs: if True, remove the <back> element before serializing
    :returns: the article XML as pretty-printed unicode text
    :rtype: {str}
    """
    # Dropping blank text lets pretty_print re-indent the document cleanly.
    xml_tree = et.parse(self.filename, et.XMLParser(remove_blank_text=True))
    if exclude_refs:
        back_nodes = xml_tree.xpath('./back')
        # Only strip <back> when the article actually has one.
        if back_nodes:
            xml_tree.getroot().remove(back_nodes[0])
    return et.tostring(xml_tree,
                       method='xml',
                       encoding='unicode',
                       pretty_print=True)

def __repr__(self):
"""Value of an article object when you call it directly on the command line.
Shows the DOI and title of the article
:returns: DOI and title
:rtype: {str}
"""
out = "DOI: {0}\nTitle: {1}".format(self.doi, self.title)
return out


def _repr_html_(self):
    """HTML display of the article for Jupyter notebooks.

    Shows the title as a link to ``self.page`` and the DOI as a link to
    ``self.doi_link()``.

    :returns: an HTML fragment with the linked title and DOI
    :rtype: {str}
    """
    titlestyle = 'display:inline-flex;'
    titletextstyle = 'margin-left:.5em;'
    titlelink = ('<span style="{titlestyle}"><a href="{url}">'
                 '<em>{title}</em></a></span>').format(
                     url=self.page,
                     title=self.title,
                     titlestyle=titlestyle + titletextstyle,
                 )

    doilink = '<span><a href="{url}"><code>{doi}</code></a></span>'.format(
        url=self.doi_link(),
        doi=self.doi,
    )
    # NOTE(review): title and DOI are interpolated without HTML escaping;
    # confirm whether self.title can contain '&' or '<' before relying on this.
    #
    # The line break is written as <br/> (the previous </br> is not a valid
    # tag) and the DOI span is now closed instead of opening a second,
    # never-closed <span>.
    out = dedent("""<div>
        <span style="{titlestyle}">Title: {titlelink}</span><br/>
        <span>DOI: {doilink}</span>
        </div>
        """).format(doilink=doilink, titlelink=titlelink, titlestyle=titlestyle)

    return out


def reset_memoized_attrs(self):
"""Reset attributes to None when instantiating a new article object.
Expand Down Expand Up @@ -111,34 +168,6 @@ def doi(self, d):
self.reset_memoized_attrs()
self._doi = d

def __str__(self, exclude_refs=True):
    """Output when you print an article object on the command line.

    For parsing and viewing the XML of a local article. Should not be used for hashing.
    Excludes <back> element (including references list) for easier viewing.

    :param exclude_refs: remove references from the article tree (eases print viewing)
    :returns: the article XML as pretty-printed unicode text
    :rtype: {str}
    """
    parser = et.XMLParser(remove_blank_text=True)
    tree = et.parse(self.filename, parser)
    if exclude_refs:
        root = tree.getroot()
        back = tree.xpath('./back')
        # xpath returns an empty list when the article has no <back> element;
        # indexing it unconditionally raised IndexError for such articles.
        if back:
            root.remove(back[0])
    local_xml = et.tostring(tree,
                            method='xml',
                            encoding='unicode',
                            pretty_print=True)
    return local_xml

def __repr__(self):
"""Value of an article object when you call it directly on the command line.
Shows the DOI and title of the article
:returns: DOI and title
:rtype: {str}
"""
out = "DOI: {0}\nTitle: {1}".format(self.doi, self.title)
return out

def doi_link(self):
"""The link of the DOI, which redirects to the journal URL."""
Expand Down
2 changes: 1 addition & 1 deletion allofplos/corpus/plos_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

from ..plos_regex import validate_doi
from ..transformations import (BASE_URL_API, filename_to_doi, doi_to_path, doi_to_url)
from ..article_class import Article
from ..article import Article
from .gdrive import (download_file_from_google_drive, get_zip_metadata, unzip_articles,
ZIP_ID, LOCAL_ZIP, LOCAL_TEST_ZIP, TEST_ZIP_ID, min_files_for_valid_corpus)

Expand Down
2 changes: 1 addition & 1 deletion allofplos/makedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .corpus import Corpus
from .transformations import filename_to_doi, convert_country
from . import starterdir
from .article_class import Article
from .article import Article

journal_title_dict = {
'PLOS ONE': 'PLOS ONE',
Expand Down
2 changes: 1 addition & 1 deletion allofplos/samples/corpus_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from ..plos_corpus import (listdir_nohidden, uncorrected_proofs_text_list,
download_updated_xml, get_all_solr_dois,
download_check_and_move)
from ..article_class import Article
from ..article import Article

counter = collections.Counter
pmcdir = "pmc_articles"
Expand Down
2 changes: 1 addition & 1 deletion allofplos/tests/test_corpus.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from . import TESTDATADIR
from .. import Corpus, starterdir
from ..article_class import Article
from ..article import Article
from ..corpus import listdir_nohidden

import random
Expand Down
27 changes: 27 additions & 0 deletions allofplos/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import textwrap

def dedent(text):
    """Dedent ``text`` like textwrap.dedent, but tolerate an unindented first line.

    A triple-quoted literal that starts right after the opening quotes has
    no indentation on its first line, e.g.:

    '''foo
       is a bar
    '''

    For such text the first line is kept as-is and only the remaining lines
    are dedented. Text that starts with a newline, or that has no newline at
    all, is passed to textwrap.dedent unchanged.

    Adapted from ipython_genutils (ipython_genutils/text.py).

    :param text: the string to dedent
    :returns: the dedented string
    :rtype: {str}
    """
    # A leading blank line or a single-line string needs no special handling.
    if text.startswith('\n') or '\n' not in text:
        return textwrap.dedent(text)

    # Set the first line aside and dedent only what follows it.
    head, _, tail = text.partition('\n')
    return head + '\n' + textwrap.dedent(tail)

0 comments on commit d0fff94

Please sign in to comment.