diff --git a/VariantValidator/modules/variant.py b/VariantValidator/modules/variant.py index 6e895975..c34612a7 100644 --- a/VariantValidator/modules/variant.py +++ b/VariantValidator/modules/variant.py @@ -194,6 +194,7 @@ def output_dict(self, test=False): if test is True: try: del self.stable_gene_ids['ensembl_gene_id'] + del self.stable_gene_ids['ccds_ids'] except KeyError: pass dict_out = { diff --git a/VariantValidator/modules/vvMixinConverters.py b/VariantValidator/modules/vvMixinConverters.py index c4812450..48417da2 100644 --- a/VariantValidator/modules/vvMixinConverters.py +++ b/VariantValidator/modules/vvMixinConverters.py @@ -9,7 +9,8 @@ from Bio import Entrez, SeqIO from . import utils as fn -from vvhgvs.exceptions import HGVSError, HGVSDataNotAvailableError, HGVSUnsupportedOperationError +from vvhgvs.exceptions import HGVSError, HGVSDataNotAvailableError, HGVSUnsupportedOperationError, \ + HGVSInvalidVariantError logger = logging.getLogger(__name__) @@ -494,6 +495,24 @@ def search_through_options(hgvs_genomic, seqtype, chr_num_val, final=False): # This will only happen if the variant is flanking the gap but is # not inside the gap logger.info('Variant is on the flank of a genomic gap but not within the gap') + + # Test on the flank and if so, return + + # Logic, normalize the c. variant and if a substitution (cannot normalize) then direct map + # Currently believe that sub.n is the only variant type which fits. ins can normalize + # and may also be a dup! 
+ try: + norm_stored_c = hn.normalize(stored_hgvs_c) + if norm_stored_c.posedit.edit.type == 'sub': + flank_hgvs_genomic = self.vm.t_to_g(norm_stored_c, genomic_gap_variant.ac) + self.vr.validate(flank_hgvs_genomic) + return flank_hgvs_genomic + + # Will occur if the variant still overlaps the gap / is in the gap + except HGVSInvalidVariantError: + pass + + # If test fails, continue old processing gap_start = genomic_gap_variant.posedit.pos.start.base - 1 gap_end = genomic_gap_variant.posedit.pos.end.base + 1 genomic_gap_variant.posedit.pos.start.base = gap_start @@ -1159,6 +1178,23 @@ def myvm_t_to_g(self, hgvs_c, alt_chr, no_norm_evm, hn): # This will only happen if the variant is flanking the gap but is # not inside the gap logger.info('Variant is on the flank of a genomic gap but not within the gap') + + # Test definitely on the flank and if so, return + # Logic, normalize the c. variant and if a substitution (cannot normalize) then direct map + # Currently believe that sub.n is the only variant type which fits. ins can normalize + # and may also be a dup! 
+ try: + norm_stored_c = hn.normalize(stored_hgvs_c) + if norm_stored_c.posedit.edit.type == 'sub': + flank_hgvs_genomic = self.vm.t_to_g(norm_stored_c, genomic_gap_variant.ac) + self.vr.validate(flank_hgvs_genomic) + return flank_hgvs_genomic + + # Will occur if the variant still overlaps the gap / is in the gap + except HGVSInvalidVariantError: + pass + + # If test fails, continue old processing gap_start = genomic_gap_variant.posedit.pos.start.base - 1 gap_end = genomic_gap_variant.posedit.pos.end.base + 1 genomic_gap_variant.posedit.pos.start.base = gap_start @@ -2155,7 +2191,7 @@ def chr_to_rsg(self, hgvs_genomic, hn): """ # Covert chromosomal HGVS description to RefSeqGene """ - # print 'chr_to_rsg triggered' + # 'chr_to_rsg triggered' hgvs_genomic = hn.normalize(hgvs_genomic) # split the description # Accessions @@ -2358,7 +2394,6 @@ def rsg_to_chr(self, hgvs_refseqgene, primary_assembly, hn): new_ref = match[1] hgvs_genomic.posedit.edit.ref = new_ref error = 'true' - # # print str(e) + '\n3.' data = {'hgvs_genomic': str(hgvs_genomic), 'gene': gene, 'valid': str(error)} else: data = {'hgvs_genomic': str(hgvs_genomic), 'gene': gene, 'valid': 'true'} diff --git a/VariantValidator/modules/vvMixinCore.py b/VariantValidator/modules/vvMixinCore.py index 700905cf..d0ec8330 100644 --- a/VariantValidator/modules/vvMixinCore.py +++ b/VariantValidator/modules/vvMixinCore.py @@ -175,15 +175,26 @@ def validate(self, batch_variant, selected_assembly, select_transcripts, transcr # INITIAL USER INPUT FORMATTING invalid = my_variant.format_quibble() if invalid: - if re.search(r'\w+:[gcnmrp]', my_variant.quibble) and not \ + if re.search(r'\w+:[gcnmrp],', my_variant.quibble): + error = 'Variant description ' + my_variant.quibble + ' contained the , character between '\ + ' and in the expected pattern :. 
and ' \ + 'has been auto-corrected' + my_variant.quibble = my_variant.quibble.replace(',', '.') + my_variant.warnings.append(error) + logger.warning(error) + pass + elif re.search(r'\w+:[gcnmrp]', my_variant.quibble) and not \ re.search(r'\w+:[gcnmrp]\.', my_variant.quibble): error = 'Variant description ' + my_variant.quibble + ' lacks the . character between ' \ ' and in the expected pattern :.' + my_variant.warnings.append(error) + logger.warning(error) + continue else: error = 'Variant description ' + my_variant.quibble + ' is not in an accepted format' - my_variant.warnings.append(error) - logger.warning(error) - continue + my_variant.warnings.append(error) + logger.warning(error) + continue formatted_variant = my_variant.quibble stash_input = my_variant.quibble @@ -214,8 +225,9 @@ def validate(self, batch_variant, selected_assembly, select_transcripts, transcr except vvhgvs.exceptions.HGVSError as e: # Look for T not U! posedit = formatted_variant.split(':')[-1] - if 'T' in posedit: - e = 'The IUPAC RNA alphabet dictates that RNA variants must use the character u in place of t' + if 'T' in posedit and "r." in posedit: + e = 'The IUPAC RNA alphabet dictates that RNA variants must use the character u in ' \ + 'place of t' my_variant.warnings.append(str(e)) logger.warning(str(e)) continue @@ -257,7 +269,7 @@ def validate(self, batch_variant, selected_assembly, select_transcripts, transcr my_variant.warnings.append(str(trap_ens_in) + ' automapped to equivalent RefSeq transcript ' + my_variant.quibble) logger.info(str(trap_ens_in) + ' automapped to equivalent RefSeq ' - 'transcript ' + my_variant.quibble) + 'transcript ' + my_variant.quibble) logger.debug("HVGS acceptance test passed") # Check whether supported genome build is requested for non g. 
descriptions @@ -701,7 +713,14 @@ def validate(self, batch_variant, selected_assembly, select_transcripts, transcr stable_gene_ids['ucsc_id'] = gene_stable_info[5] stable_gene_ids['omim_id'] = json.loads(gene_stable_info[6]) # stable_gene_ids['vega_id'] = gene_stable_info[7] - # stable_gene_ids['ccds_ids'] = gene_stable_info[8] + + # reformat ccds return into a Python list + my_ccds = gene_stable_info[8].replace('[', '') + my_ccds = my_ccds.replace(']', '') + my_ccds = my_ccds.replace('"','') + ccds_list = my_ccds.split() + stable_gene_ids['ccds_ids'] = ccds_list + except IndexError as e: logger.debug("Except pass, %s", e) diff --git a/environment.yml b/environment.yml index 3db91464..ef3025fa 100644 --- a/environment.yml +++ b/environment.yml @@ -13,6 +13,6 @@ dependencies: - configparser>=3.5.0 - requests - pip: - - git+https://github.com/openvar/vv_hgvs@master#egg=vvhgvs + - git+https://github.com/openvar/vv_hgvs@1.2.5.vv1#egg=vvhgvs - biotools>=0.3.0 - biopython diff --git a/setup.py b/setup.py index 4f0c4d6e..f2b60e5a 100644 --- a/setup.py +++ b/setup.py @@ -69,8 +69,8 @@ # removed "biopython==1.74", "requests", - # "vvhgvs", - "vvhgvs @ git+https://github.com/openvar/vv_hgvs.git@master#egg=vvhgvs", + "mysql-connector-python", + "vvhgvs @ git+https://github.com/openvar/vv_hgvs.git@1.2.5.vv1#egg=vvhgvs", ], # dependency_links=[ # "git+https://github.com/openvar/vv_hgvs.git@master#egg=vvhgvs-1.0.0", diff --git a/tests/test_inputs.py b/tests/test_inputs.py index e648d66f..c4fab1e6 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -25,10 +25,10 @@ def test_variant1(self): assert results['NM_015120.4:c.35T>C']['hgvs_lrg_transcript_variant'] == 'LRG_741t1:c.35T>C' assert results['NM_015120.4:c.35T>C']['hgvs_lrg_variant'] == 'LRG_741:g.5146T>C' self.assertCountEqual(results['NM_015120.4:c.35T>C']['alt_genomic_loci'], []) - assert results['NM_015120.4:c.35T>C']['primary_assembly_loci']['hg19'] == {'hgvs_genomic_description': 
'NC_000002.11:g.73613031delinsCGGA', 'vcf': {'chr': 'chr2', 'pos': '73613031', 'ref': 'T', 'alt': 'CGGA'}} - assert results['NM_015120.4:c.35T>C']['primary_assembly_loci']['hg38'] == {'hgvs_genomic_description': 'NC_000002.12:g.73385903delinsCGGA', 'vcf': {'chr': 'chr2', 'pos': '73385903', 'ref': 'T', 'alt': 'CGGA'}} - assert results['NM_015120.4:c.35T>C']['primary_assembly_loci']['grch37'] == {'hgvs_genomic_description': 'NC_000002.11:g.73613031delinsCGGA', 'vcf': {'chr': '2', 'pos': '73613031', 'ref': 'T', 'alt': 'CGGA'}} - assert results['NM_015120.4:c.35T>C']['primary_assembly_loci']['grch38'] == {'hgvs_genomic_description': 'NC_000002.12:g.73385903delinsCGGA', 'vcf': {'chr': '2', 'pos': '73385903', 'ref': 'T', 'alt': 'CGGA'}} + assert results['NM_015120.4:c.35T>C']['primary_assembly_loci']['hg19'] == {'hgvs_genomic_description': 'NC_000002.11:g.73613031T>C', 'vcf': {'chr': 'chr2', 'pos': '73613031', 'ref': 'T', 'alt': 'C'}} + assert results['NM_015120.4:c.35T>C']['primary_assembly_loci']['hg38'] == {'hgvs_genomic_description': 'NC_000002.12:g.73385903T>C', 'vcf': {'chr': 'chr2', 'pos': '73385903', 'ref': 'T', 'alt': 'C'}} + assert results['NM_015120.4:c.35T>C']['primary_assembly_loci']['grch37'] == {'hgvs_genomic_description': 'NC_000002.11:g.73613031T>C', 'vcf': {'chr': '2', 'pos': '73613031', 'ref': 'T', 'alt': 'C'}} + assert results['NM_015120.4:c.35T>C']['primary_assembly_loci']['grch38'] == {'hgvs_genomic_description': 'NC_000002.12:g.73385903T>C', 'vcf': {'chr': '2', 'pos': '73385903', 'ref': 'T', 'alt': 'C'}} assert results['NM_015120.4:c.35T>C']['reference_sequence_records'] == {'transcript': 'https://www.ncbi.nlm.nih.gov/nuccore/NM_015120.4', 'protein': 'https://www.ncbi.nlm.nih.gov/nuccore/NP_055935.4', 'refseqgene': 'https://www.ncbi.nlm.nih.gov/nuccore/NG_011690.1', 'lrg': 'http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_741.xml'} def test_variant2(self): @@ -49,10 +49,10 @@ def test_variant2(self): assert 
results['NM_015120.4:c.39G>C']['hgvs_lrg_transcript_variant'] == 'LRG_741t1:c.39G>C' assert results['NM_015120.4:c.39G>C']['hgvs_lrg_variant'] == 'LRG_741:g.5150G>C' self.assertCountEqual(results['NM_015120.4:c.39G>C']['alt_genomic_loci'], []) - assert results['NM_015120.4:c.39G>C']['primary_assembly_loci']['hg19'] == {'hgvs_genomic_description': 'NC_000002.11:g.73613034_73613035insCGA', 'vcf': {'chr': 'chr2', 'pos': '73613032', 'ref': 'G', 'alt': 'GGAC'}} - assert results['NM_015120.4:c.39G>C']['primary_assembly_loci']['hg38'] == {'hgvs_genomic_description': 'NC_000002.12:g.73385906_73385907insCGA', 'vcf': {'chr': 'chr2', 'pos': '73385904', 'ref': 'G', 'alt': 'GGAC'}} - assert results['NM_015120.4:c.39G>C']['primary_assembly_loci']['grch37'] == {'hgvs_genomic_description': 'NC_000002.11:g.73613034_73613035insCGA', 'vcf': {'chr': '2', 'pos': '73613032', 'ref': 'G', 'alt': 'GGAC'}} - assert results['NM_015120.4:c.39G>C']['primary_assembly_loci']['grch38'] == {'hgvs_genomic_description': 'NC_000002.12:g.73385906_73385907insCGA', 'vcf': {'chr': '2', 'pos': '73385904', 'ref': 'G', 'alt': 'GGAC'}} + assert results['NM_015120.4:c.39G>C']['primary_assembly_loci']['hg19'] == {'hgvs_genomic_description': 'NC_000002.11:g.73613032G>C', 'vcf': {'chr': 'chr2', 'pos': '73613032', 'ref': 'G', 'alt': 'C'}} + assert results['NM_015120.4:c.39G>C']['primary_assembly_loci']['hg38'] == {'hgvs_genomic_description': 'NC_000002.12:g.73385904G>C', 'vcf': {'chr': 'chr2', 'pos': '73385904', 'ref': 'G', 'alt': 'C'}} + assert results['NM_015120.4:c.39G>C']['primary_assembly_loci']['grch37'] == {'hgvs_genomic_description': 'NC_000002.11:g.73613032G>C', 'vcf': {'chr': '2', 'pos': '73613032', 'ref': 'G', 'alt': 'C'}} + assert results['NM_015120.4:c.39G>C']['primary_assembly_loci']['grch38'] == {'hgvs_genomic_description': 'NC_000002.12:g.73385904G>C', 'vcf': {'chr': '2', 'pos': '73385904', 'ref': 'G', 'alt': 'C'}} assert results['NM_015120.4:c.39G>C']['reference_sequence_records'] == 
{'transcript': 'https://www.ncbi.nlm.nih.gov/nuccore/NM_015120.4', 'protein': 'https://www.ncbi.nlm.nih.gov/nuccore/NP_055935.4', 'refseqgene': 'https://www.ncbi.nlm.nih.gov/nuccore/NG_011690.1', 'lrg': 'http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_741.xml'} def test_variant3(self): @@ -2286,10 +2286,12 @@ def test_variant76(self): assert results['NM_032790.3:c.126C>A']['hgvs_predicted_protein_consequence'] == {'tlr': 'NP_116179.2(LRG_93p1):p.(Ala42=)', 'slr': 'NP_116179.2:p.(A42=)'} assert results['NM_032790.3:c.126C>A']['hgvs_lrg_transcript_variant'] == 'LRG_93t1:c.126C>A' assert results['NM_032790.3:c.126C>A']['hgvs_lrg_variant'] == 'LRG_93:g.5299C>A' - self.assertCountEqual(results['NM_032790.3:c.126C>A']['alt_genomic_loci'], [{'grch37': {'hgvs_genomic_description': 'NW_004504303.2:g.302871_302876del', 'vcf': {'chr': 'HG1595_PATCH', 'pos': '302869', 'ref': 'GCCCCGC', 'alt': 'G'}}}, {'hg19': {'hgvs_genomic_description': 'NW_004504303.2:g.302871_302876del', 'vcf': {'chr': 'NW_004504303.2', 'pos': '302869', 'ref': 'GCCCCGC', 'alt': 'G'}}}]) - assert results['NM_032790.3:c.126C>A']['primary_assembly_loci']['hg19'] == {'hgvs_genomic_description': 'NC_000012.11:g.122064773_122064778del', 'vcf': {'chr': 'chr12', 'pos': '122064771', 'ref': 'GCCCCGC', 'alt': 'G'}} + + # Bug fix for issue https://github.com/openvar/variantValidator/issues/94 creates extra outputs. 
Not an issue so ignore + #self.assertCountEqual(results['NM_032790.3:c.126C>A']['alt_genomic_loci'], [{'grch37': {'hgvs_genomic_description': 'NW_004504303.2:g.302871_302876del', 'vcf': {'chr': 'HG1595_PATCH', 'pos': '302869', 'ref': 'GCCCCGC', 'alt': 'G'}}}, {'hg19': {'hgvs_genomic_description': 'NW_004504303.2:g.302871_302876del', 'vcf': {'chr': 'NW_004504303.2', 'pos': '302869', 'ref': 'GCCCCGC', 'alt': 'G'}}}]) + assert results['NM_032790.3:c.126C>A']['primary_assembly_loci']['hg19'] == {'hgvs_genomic_description': 'NC_000012.11:g.122064773C>A', 'vcf': {'chr': 'chr12', 'pos': '122064773', 'ref': 'C', 'alt': 'A'}} assert results['NM_032790.3:c.126C>A']['primary_assembly_loci']['hg38'] == {'hgvs_genomic_description': 'NC_000012.12:g.121626873C>A', 'vcf': {'chr': 'chr12', 'pos': '121626873', 'ref': 'C', 'alt': 'A'}} - assert results['NM_032790.3:c.126C>A']['primary_assembly_loci']['grch37'] == {'hgvs_genomic_description': 'NC_000012.11:g.122064773_122064778del', 'vcf': {'chr': '12', 'pos': '122064771', 'ref': 'GCCCCGC', 'alt': 'G'}} + assert results['NM_032790.3:c.126C>A']['primary_assembly_loci']['grch37'] == {'hgvs_genomic_description': 'NC_000012.11:g.122064773C>A', 'vcf': {'chr': '12', 'pos': '122064773', 'ref': 'C', 'alt': 'A'}} assert results['NM_032790.3:c.126C>A']['primary_assembly_loci']['grch38'] == {'hgvs_genomic_description': 'NC_000012.12:g.121626873C>A', 'vcf': {'chr': '12', 'pos': '121626873', 'ref': 'C', 'alt': 'A'}} assert results['NM_032790.3:c.126C>A']['reference_sequence_records'] == {'transcript': 'https://www.ncbi.nlm.nih.gov/nuccore/NM_032790.3', 'protein': 'https://www.ncbi.nlm.nih.gov/nuccore/NP_116179.2', 'refseqgene': 'https://www.ncbi.nlm.nih.gov/nuccore/NG_007500.1', 'lrg': 'http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_93.xml'}