Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Partial DOIs #101

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions allofplos/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@

from . import get_corpus_dir
from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX,
URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path)
from .plos_regex import validate_doi
URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path,
partial_to_doi)
from .plos_regex import validate_doi, find_valid_partial_dois
from .elements import (parse_article_date, get_contrib_info,
Journal, License, match_contribs_to_dicts)
from .utils import dedent
Expand Down Expand Up @@ -124,6 +125,11 @@ def doi(self):
"""
return self._doi

@property
def partial_doi(self):
"""Convert a DOI to a partial DOI."""
return self.doi.lstrip('10.1371/').replace('journal.', '')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why aren't you using doi_to_partial?


@property
def text_viewer(self):
"""Command line application for viewing text to be used with
Expand Down Expand Up @@ -1336,3 +1342,15 @@ def from_filename(cls, filename):
else:
directory = None
return cls(filename_to_doi(filename), directory=directory)

@classmethod
def from_partial_doi(cls, partial_doi, directory=None):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't you validate the partial doi first?

"""Initiate an article object using a partial DOI.
Uses regex to make sure it's a valid partial DOI.
Used for internal PLOS methods.
"""
if directory is None:
directory = get_corpus_dir()
doi = partial_to_doi(partial_doi)

return cls(doi, directory=directory)
8 changes: 6 additions & 2 deletions allofplos/corpus/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from itertools import islice

from .. import get_corpus_dir, Article
from ..transformations import filename_to_doi, doi_to_path
from ..transformations import filename_to_doi, doi_to_path, partial_to_doi
from ..plos_regex import validate_doi, validate_partial_doi


class Corpus:
Expand Down Expand Up @@ -36,13 +37,16 @@ def __iter__(self):
return (article for article in self.random_article_generator)

def __getitem__(self, key):

if isinstance(key, int):
return Article(self.dois[key], directory=self.directory)
elif isinstance(key, slice):
return (Article(doi, directory=self.directory)
for doi in self.dois[key])
elif key not in self.dois:
if partial_to_doi(key) in self.dois:
return Article.from_partial_doi(key, directory=self.directory)
elif validate_partial_doi(key):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wouldn't it be safer to validate first?

key = partial_to_doi(key)
path= doi_to_path(key, directory=self.directory)
raise IndexError(("You attempted get {doi} from "
"the corpus at \n{directory}. \n"
Expand Down
21 changes: 21 additions & 0 deletions allofplos/plos_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match)
full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}"
"|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}")
partial_doi_regex_search = re.compile(r"p[a-zA-Z]{3}\.[\d]{7}"
"|annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}")
partial_doi_regex_match = re.compile(r"^p[a-zA-Z]{3}\.[\d]{7}$"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this is just the same string with ^+your_str+$, am I correct? and the last part of it is almost the same as the last part of the full_doi_regex_search?

Why not make this based on a single string that you then combine as needed to make the search and match?

I'm not sure you need to define search vs. match separately anyway (in this case at least). I ask below… but it'd be a lot easier to know whether that is the case if you tested the regexes directly.

r"|^annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$")
currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents)
file_regex_match = re.compile(regex_file_search+r"\.xml")
BASE_URL = 'http://journals.plos.org/plosone/article/file?id='
Expand All @@ -41,6 +45,14 @@ def validate_doi(doi):
return bool(full_doi_regex_match.search(doi))


def validate_partial_doi(partial_doi):
"""For an individual string, tests whether the full string is in a valid PLOS partial DOI format.
Example: 'pbio.2000777' is True, but '10.1371/journal.pbio.2000777' is False
:return: True if string is in valid PLOS partial DOI format; False if not
"""
return bool(partial_doi_regex_match.search(partial_doi))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is the match regex used to search?



def validate_filename(filename):
"""
For an individual string, tests whether the full string is in a valid article file. This can take two forms.
Expand Down Expand Up @@ -75,6 +87,15 @@ def find_valid_dois(doi):
return full_doi_regex_search.findall(doi)


def find_valid_partial_dois(doi):
"""
For an individual string, searches for any valid partial PLOS DOIs within it and returns them
Used for finding DOIs in PLOS job tickets
:return: list of valid PLOS partial DOIs contained within string
"""
return partial_doi_regex_search.findall(doi)


def show_invalid_dois(doi_list):
"""
Checks to see whether a list of PLOS DOIs follow the correct format. Used mainly to determine
Expand Down
52 changes: 52 additions & 0 deletions allofplos/tests/test_partial_dois.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from .. import Corpus, Article, starterdir
from ..plos_regex import validate_partial_doi, validate_doi
from ..transformations import partial_to_doi, doi_to_partial

import pytest


@pytest.fixture
def corpus():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should consider the corpus a module/session level scoped fixture since this is basically the same as in some of the other tests. You don't need to do it for this PR but it's worth considering

return Corpus(starterdir, seed=1000)


@pytest.fixture
def test_article():
Copy link
Collaborator

@mpacer mpacer Apr 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Generally fixtures don't have test prefixed to them (see corpus above)

return Article('10.1371/journal.pone.0040259', directory=starterdir)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could use the corpus & doi fixtures directly here, since fixtures can be used in other fixtures:

@pytest.fixture(scope='module')
def article(corpus, doi):
    return corpus[doi]



@pytest.fixture
def test_doi():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as with test_article shouldn't be prefixed with test, better is to just use def doi():

return '10.1371/journal.pone.0040259'


@pytest.fixture
def test_partial_doi():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be called partial_doi

return 'pone.0040259'


def test_partial_doi_regex(test_partial_doi):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you test the regexes directly as well?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually this test is confusingly named… its not testing the doi regexes it's testing the validate_partial_doi function.

assert validate_partial_doi(test_partial_doi)
assert not validate_partial_doi(' pone.0040259')
Copy link
Collaborator

@mpacer mpacer Apr 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're going to have a test like this it's better to make explicit what the manipulation is " " + partial_doi would do that.

assert not validate_partial_doi('pone.0040259 ')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above… reuse partial_doipartial_doi + " "



def test_partial_doi_transform(test_doi, test_partial_doi):
partial_doi = doi_to_partial(test_doi)
assert partial_doi == test_partial_doi


def test_doi_transform(test_partial_doi, test_doi):
doi = partial_to_doi(test_partial_doi)
assert validate_doi(doi)
assert doi == test_doi


def test_partial_doi_method_article(test_partial_doi, test_article):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically I would prefer this to be in its own test_article.py for article functionality but I don't think that exists.

article = Article.from_partial_doi(test_partial_doi, directory=starterdir)
assert article == test_article


def test_partial_doi_method_corpus(corpus, test_article, test_partial_doi):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be in the test_corpus.py testing module.

article = corpus[test_partial_doi]
assert article == test_article
18 changes: 17 additions & 1 deletion allofplos/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from . import get_corpus_dir

from .plos_regex import validate_filename, validate_doi
from .plos_regex import validate_filename, validate_doi, validate_partial_doi
from .elements import Journal

# URL bases for PLOS's Solr instances, that index PLOS articles
Expand Down Expand Up @@ -183,6 +183,22 @@ def doi_to_path(doi, directory=None):
article_file = os.path.join(directory, doi.lstrip(PREFIX) + SUFFIX_LOWER)
return article_file

def partial_to_doi(partial_doi):
"""Convert a partial DOI into a DOI."""
if validate_partial_doi(partial_doi) is False:
raise Exception("Invalid format for PLOS partial DOI: {}".format(partial_doi))
if partial_doi.startswith('annotation'):
doi = PREFIX + partial_doi
else:
doi = ''.join([PREFIX, 'journal.', partial_doi])
return doi

def doi_to_partial(doi):
"""Convert a DOI into a partial DOI."""
if validate_doi(doi) is False:
raise Exception("Invalid format for PLOS DOI: {}".format(doi))
return doi.lstrip('10.1371/').replace('journal.', '')


def convert_country(country):
"""
Expand Down