Merge pull request #91 from mpacer/pretty_articles

Pretty articles
PLOS · Mar 28, 2018 · d0fff94 · d0fff94
2 parents a2f2b75 + 1c1349f
commit d0fff94
Show file tree

Hide file tree

Showing 7 changed files with 91 additions and 35 deletions.
diff --git a/allofplos/__init__.py b/allofplos/__init__.py
@@ -30,5 +30,5 @@ def get_corpus_dir():
 # import after creating global variables that they may rely upon
 # (e.g., corpusdir)
 
-from .article_class import Article
+from .article import Article
 from .corpus import Corpus
diff --git a/allofplos/article_class.py → allofplos/article.py b/allofplos/article_class.py → allofplos/article.py
@@ -9,13 +9,14 @@
 
 from . import get_corpus_dir
 from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX,
-                              URL_SUFFIX, plos_page_dict, doi_url)
+                              URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path)
 from .plos_regex import validate_doi
 from .elements import (parse_article_date, get_contrib_info,
                        Journal, License, match_contribs_to_dicts)
+from .utils import dedent
 
 
-class Article():
+class Article:
     """The primary object of a PLOS article, initialized by a valid PLOS DOI.
 
     """
@@ -46,6 +47,62 @@ def __eq__(self, other):
         dir_eq = self.directory == other.directory
         return doi_eq and dir_eq
 
+    def __str__(self, exclude_refs=True):
+        """Return the local article XML as a pretty-printed string.
+
+        Used when an article object is printed, for parsing and viewing the XML
+        of a local article. Should not be used for hashing.
+        :param exclude_refs: drop the <back> element (including the references
+            list) from the article tree for easier viewing
+        :returns: pretty-printed XML of the article
+        :rtype: {str}
+        """
+        xml_parser = et.XMLParser(remove_blank_text=True)
+        xml_tree = et.parse(self.filename, xml_parser)
+        # Only search for <back> when asked to drop it; the emptiness check
+        # below covers trees that have no <back> child.
+        back_elements = xml_tree.xpath('./back') if exclude_refs else []
+        if back_elements:
+            xml_tree.getroot().remove(back_elements[0])
+        return et.tostring(xml_tree, method='xml', encoding='unicode',
+                           pretty_print=True)
+
+    def __repr__(self):
+        """Return the short text shown when an article object is evaluated directly.
+
+        Built from the article's ``doi`` and ``title``, one per line.
+        :returns: DOI and title
+        :rtype: {str}
+        """
+        doi, title = self.doi, self.title
+        return "DOI: {0}\nTitle: {1}".format(doi, title)
+
+
+    def _repr_html_(self):
+        """Return an HTML snippet giving a nice display in a Jupyter notebook.
+
+        Links the title to ``self.page`` and the DOI to ``self.doi_link()``.
+        Interpolated values are HTML-escaped so that characters such as ``<`` or
+        ``&`` in a title cannot break the markup.
+        :returns: HTML fragment wrapped in a <div>
+        """
+        from html import escape  # stdlib; imported locally, only needed here
+        titlestyle = 'display:inline-flex;'
+        titletextstyle = 'margin-left:.5em;'
+        titlelink = ('<span style="{titlestyle}"><a href="{url}">'
+                     '<em>{title}</em></a></span>').format(
+                         url=escape(str(self.page)), title=escape(str(self.title)),
+                         titlestyle=titlestyle + titletextstyle)
+        doilink = '<span><a href="{url}"><code>{doi}</code></a></span>'.format(
+            url=escape(str(self.doi_link())), doi=escape(str(self.doi)))
+        # <br/> is the valid line-break tag; the "DOI:" label span is closed before the link.
+        return dedent("""<div>
+        <span style="{titlestyle}">Title: {titlelink}</span><br/>
+        <span>DOI: </span>{doilink}
+        </div>
+        """).format(doilink=doilink, titlelink=titlelink, titlestyle=titlestyle)
+
+
     def reset_memoized_attrs(self):
         """Reset attributes to None when instantiating a new article object.
 
@@ -111,34 +168,6 @@ def doi(self, d):
         self.reset_memoized_attrs()
         self._doi = d
 
-    def __str__(self, exclude_refs=True):
-        """Output when you print an article object on the command line.
-
-        For parsing and viewing the XML of a local article. Should not be used for hashing
-        Excludes <back> element (including references list) for easier viewing
-        :param exclude_refs: remove references from the article tree (eases print viewing)
-        """
-        parser = et.XMLParser(remove_blank_text=True)
-        tree = et.parse(self.filename, parser)
-        if exclude_refs:
-            root = tree.getroot()
-            back = tree.xpath('./back')
-            root.remove(back[0])
-        local_xml = et.tostring(tree,
-                                method='xml',
-                                encoding='unicode',
-                                pretty_print=True)
-        return local_xml
-
-    def __repr__(self):
-        """Value of an article object when you call it directly on the command line.
-
-        Shows the DOI and title of the article
-        :returns: DOI and title
-        :rtype: {str}
-        """
-        out = "DOI: {0}\nTitle: {1}".format(self.doi, self.title)
-        return out
 
     def doi_link(self):
         """The link of the DOI, which redirects to the journal URL."""

diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py
@@ -37,7 +37,7 @@
 
 from ..plos_regex import validate_doi
 from ..transformations import (BASE_URL_API, filename_to_doi, doi_to_path, doi_to_url)
-from ..article_class import Article
+from ..article import Article
 from .gdrive import (download_file_from_google_drive, get_zip_metadata, unzip_articles,
                      ZIP_ID, LOCAL_ZIP, LOCAL_TEST_ZIP, TEST_ZIP_ID, min_files_for_valid_corpus)
 

diff --git a/allofplos/makedb.py b/allofplos/makedb.py
@@ -20,7 +20,7 @@
 from .corpus import Corpus
 from .transformations import filename_to_doi, convert_country
 from . import starterdir
-from .article_class import Article
+from .article import Article
 
 journal_title_dict = {
     'PLOS ONE': 'PLOS ONE',

diff --git a/allofplos/samples/corpus_analysis.py b/allofplos/samples/corpus_analysis.py
@@ -22,7 +22,7 @@
 from ..plos_corpus import (listdir_nohidden, uncorrected_proofs_text_list,
                            download_updated_xml, get_all_solr_dois,
                            download_check_and_move)
-from ..article_class import Article
+from ..article import Article
 
 counter = collections.Counter
 pmcdir = "pmc_articles"

diff --git a/allofplos/tests/test_corpus.py b/allofplos/tests/test_corpus.py
@@ -1,6 +1,6 @@
 from . import TESTDATADIR
 from .. import Corpus, starterdir
-from ..article_class import Article
+from ..article import Article
 from ..corpus import listdir_nohidden
 
 import random

diff --git a/allofplos/utils.py b/allofplos/utils.py
@@ -0,0 +1,27 @@
+import textwrap
+
+def dedent(text):
+    """Dedent *text* like textwrap.dedent, but ignore an unindented first line.
+
+    This means strings whose first line directly follows the opening quotes
+    are still dedented, for example:
+    '''foo
+    is a bar
+    '''
+
+    Adapted from the ``text`` module of ipython_genutils
+    (https://github.com/ipython/ipython_genutils).
+
+    :param text: the string to dedent
+    :returns: the dedented string
+    :rtype: {str}
+    """
+    if text.startswith('\n') or '\n' not in text:
+        # Leading blank line, or only one line: the first line needs no
+        # special treatment, so dedent the text as a whole.
+        return textwrap.dedent(text)
+
+    # Leave the first line untouched and dedent only the remainder, so the
+    # first line's missing indentation does not stop the rest from dedenting.
+    head, tail = text.split('\n', 1)
+    return head + '\n' + textwrap.dedent(tail)