-
Notifications
You must be signed in to change notification settings - Fork 17
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Partial DOIs #101
base: master
Are you sure you want to change the base?
Partial DOIs #101
Changes from all commits
c8a474e
dee96f4
8ca7b20
30ee364
322e7d0
2148ccf
ee71745
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,8 +9,9 @@ | |
|
||
from . import get_corpus_dir | ||
from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX, | ||
URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path) | ||
from .plos_regex import validate_doi | ||
URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path, | ||
partial_to_doi) | ||
from .plos_regex import validate_doi, find_valid_partial_dois | ||
from .elements import (parse_article_date, get_contrib_info, | ||
Journal, License, match_contribs_to_dicts) | ||
from .utils import dedent | ||
|
@@ -124,6 +125,11 @@ def doi(self): | |
""" | ||
return self._doi | ||
|
||
@property | ||
def partial_doi(self): | ||
"""Convert a DOI to a partial DOI.""" | ||
return self.doi.lstrip('10.1371/').replace('journal.', '') | ||
|
||
@property | ||
def text_viewer(self): | ||
"""Command line application for viewing text to be used with | ||
|
@@ -1336,3 +1342,15 @@ def from_filename(cls, filename): | |
else: | ||
directory = None | ||
return cls(filename_to_doi(filename), directory=directory) | ||
|
||
@classmethod | ||
def from_partial_doi(cls, partial_doi, directory=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why don't you validate the partial doi first? |
||
"""Initiate an article object using a partial DOI. | ||
Uses regex to make sure it's a valid partial DOI. | ||
Used for internal PLOS methods. | ||
""" | ||
if directory is None: | ||
directory = get_corpus_dir() | ||
doi = partial_to_doi(partial_doi) | ||
|
||
return cls(doi, directory=directory) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,8 @@ | |
from itertools import islice | ||
|
||
from .. import get_corpus_dir, Article | ||
from ..transformations import filename_to_doi, doi_to_path | ||
from ..transformations import filename_to_doi, doi_to_path, partial_to_doi | ||
from ..plos_regex import validate_doi, validate_partial_doi | ||
|
||
|
||
class Corpus: | ||
|
@@ -36,13 +37,16 @@ def __iter__(self): | |
return (article for article in self.random_article_generator) | ||
|
||
def __getitem__(self, key): | ||
|
||
if isinstance(key, int): | ||
return Article(self.dois[key], directory=self.directory) | ||
elif isinstance(key, slice): | ||
return (Article(doi, directory=self.directory) | ||
for doi in self.dois[key]) | ||
elif key not in self.dois: | ||
if partial_to_doi(key) in self.dois: | ||
return Article.from_partial_doi(key, directory=self.directory) | ||
elif validate_partial_doi(key): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wouldn't it be safer to validate first? |
||
key = partial_to_doi(key) | ||
path= doi_to_path(key, directory=self.directory) | ||
raise IndexError(("You attempted get {doi} from " | ||
"the corpus at \n{directory}. \n" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,10 @@ | |
full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match) | ||
full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}" | ||
"|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") | ||
partial_doi_regex_search = re.compile(r"p[a-zA-Z]{3}\.[\d]{7}" | ||
"|annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") | ||
partial_doi_regex_match = re.compile(r"^p[a-zA-Z]{3}\.[\d]{7}$" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like this is just the same string with Why not make this based on a single string that you then combine as needed to make the search and match? I'm not sure you need to define search vs. match separately anyway (in this case at least). I ask below… but it'd be a lot easier to know whether that is the case if you tested the regexes directly. |
||
r"|^annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$") | ||
currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents) | ||
file_regex_match = re.compile(regex_file_search+r"\.xml") | ||
BASE_URL = 'http://journals.plos.org/plosone/article/file?id=' | ||
|
@@ -41,6 +45,14 @@ def validate_doi(doi): | |
return bool(full_doi_regex_match.search(doi)) | ||
|
||
|
||
def validate_partial_doi(partial_doi): | ||
"""For an individual string, tests whether the full string is in a valid PLOS partial DOI format. | ||
Example: 'pbio.2000777' is True, but '10.1371/journal.pbio.2000777' is False | ||
:return: True if string is in valid PLOS partial DOI format; False if not | ||
""" | ||
return bool(partial_doi_regex_match.search(partial_doi)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is the match regex used to search? |
||
|
||
|
||
def validate_filename(filename): | ||
""" | ||
For an individual string, tests whether the full string is in a valid article file. This can take two forms. | ||
|
@@ -75,6 +87,15 @@ def find_valid_dois(doi): | |
return full_doi_regex_search.findall(doi) | ||
|
||
|
||
def find_valid_partial_dois(doi): | ||
""" | ||
For an individual string, searches for any valid partial PLOS DOIs within it and returns them | ||
Used for finding DOIs in PLOS job tickets | ||
:return: list of valid PLOS partial DOIs contained within string | ||
""" | ||
return partial_doi_regex_search.findall(doi) | ||
|
||
|
||
def show_invalid_dois(doi_list): | ||
""" | ||
Checks to see whether a list of PLOS DOIs follow the correct format. Used mainly to determine | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from .. import Corpus, Article, starterdir | ||
from ..plos_regex import validate_partial_doi, validate_doi | ||
from ..transformations import partial_to_doi, doi_to_partial | ||
|
||
import pytest | ||
|
||
|
||
@pytest.fixture | ||
def corpus(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should consider the corpus a module/session level scoped fixture since this is basically the same as in some of the other tests. You don't need to do it for this PR but it's worth considering |
||
return Corpus(starterdir, seed=1000) | ||
|
||
|
||
@pytest.fixture | ||
def test_article(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Generally fixtures don't have test prefixed to them (see corpus above) |
||
return Article('10.1371/journal.pone.0040259', directory=starterdir) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could use the corpus & doi fixtures directly here, since fixtures can be used in other fixtures:
|
||
|
||
|
||
@pytest.fixture | ||
def test_doi(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as with |
||
return '10.1371/journal.pone.0040259' | ||
|
||
|
||
@pytest.fixture | ||
def test_partial_doi(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be called |
||
return 'pone.0040259' | ||
|
||
|
||
def test_partial_doi_regex(test_partial_doi): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you test the regexes directly as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually this test is confusingly named… its not testing the doi regexes it's testing the validate_partial_doi function. |
||
assert validate_partial_doi(test_partial_doi) | ||
assert not validate_partial_doi(' pone.0040259') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you're going to have a test like this it's better to make explicit what the manipulation is |
||
assert not validate_partial_doi('pone.0040259 ') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above… reuse |
||
|
||
|
||
def test_partial_doi_transform(test_doi, test_partial_doi): | ||
partial_doi = doi_to_partial(test_doi) | ||
assert partial_doi == test_partial_doi | ||
|
||
|
||
def test_doi_transform(test_partial_doi, test_doi): | ||
doi = partial_to_doi(test_partial_doi) | ||
assert validate_doi(doi) | ||
assert doi == test_doi | ||
|
||
|
||
def test_partial_doi_method_article(test_partial_doi, test_article): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. technically I would prefer this to be in its own |
||
article = Article.from_partial_doi(test_partial_doi, directory=starterdir) | ||
assert article == test_article | ||
|
||
|
||
def test_partial_doi_method_corpus(corpus, test_article, test_partial_doi): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be in the |
||
article = corpus[test_partial_doi] | ||
assert article == test_article |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why aren't you using
doi_to_partial
?