* When running differential expression or feature counting, RNAlysis …

…session reports will automatically include a logfile with R session info. * Added optional parameters to all differential expression functions, allowing users to return a path to a logfile with R session info.
GuyTeichman · Sep 16, 2024 · 4ce0e72 · 4ce0e72
1 parent 17f6e8f
commit 4ce0e72
Show file tree

Hide file tree

Showing 28 changed files with 286 additions and 62 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -15,6 +15,8 @@ Changed
 * Made small improvements to the RNAlysis graphical interface.
 * RNAlysis and its dependencies now run on Numpy 2 instead of Numpy 1.
 * RNAlysis now uses a different implementation of the K-Medoids clustering algorithm, which should be more stable and faster than the previous implementation. However, note that the two implementations may give slightly different results.
+* When running differential expression or feature counting, RNAlysis session reports will automatically include a logfile with R session info.
+* Added an optional 'return_log' parameter to all differential expression and featureCounts functions. When it is set to True, the function additionally returns the path to a logfile with R session info.
 
 Fixed
 ******

diff --git a/rnalysis/data_files/r_templates/logging.R b/rnalysis/data_files/r_templates/logging.R
@@ -0,0 +1,5 @@
+# Open a connection to a log file
+logfile <- file("$LOGFILE", open = "a")
+# Redirect standard output to the log file while still echoing it to the console (split = TRUE); messages are not diverted by this call
+sink(logfile, append = TRUE, split = TRUE)
+
diff --git a/rnalysis/data_files/r_templates/sessioninfo_run.R b/rnalysis/data_files/r_templates/sessioninfo_run.R
@@ -0,0 +1,4 @@
+# get session info
+sessionInfo()
+# Close the sink to stop redirecting output
+sink()
diff --git a/rnalysis/fastq.py b/rnalysis/fastq.py
@@ -804,7 +804,9 @@ def featurecounts_single_end(input_folder: Union[str, Path], output_folder: Unio
                              ignore_secondary: bool = True,
                              count_fractionally: bool = False, is_long_read: bool = False,
                              report_read_assignment: Union[Literal['bam', 'sam', 'core'], None] = None,
-                             threads: PositiveInt = 1) -> Tuple[filtering.CountFilter, pl.DataFrame, pl.DataFrame]:
+                             threads: PositiveInt = 1, return_log: bool = False) -> Union[
+    Tuple[filtering.CountFilter, pl.DataFrame, pl.DataFrame],
+    Tuple[filtering.CountFilter, pl.DataFrame, pl.DataFrame, Path]]:
     """
     Assign mapped single-end sequencing reads to specified genomic features using \
     `RSubread featureCounts <https://doi.org/10.1093/bioinformatics/btt656>`_.
@@ -860,6 +862,9 @@ def featurecounts_single_end(input_folder: Union[str, Path], output_folder: Unio
     :type report_read_assignment: 'bam', 'sam', 'core', or None (default=None)
     :param threads: number of threads to run featureCounts on. More threads will generally make feature counting faster.
     :type threads: int > 0 (default=1)
+    :param return_log: if True, the function will return the path to the analysis logfile, \
+    which includes session info.
+    :type return_log: bool (default=False)
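     :return: a count matrix (CountFilter) containing feature counts for all input files, \
     a DataFrame summarizing the features reads were aligned to, and a DataFrame summarizing the alignment statistics. \
     If return_log is True, the path to the analysis logfile is returned as an additional fourth element.
     :rtype: (filtering.CountFilter, pl.DataFrame, pl.DataFrame) or \
     (filtering.CountFilter, pl.DataFrame, pl.DataFrame, Path)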
@@ -873,9 +878,9 @@ def featurecounts_single_end(input_folder: Union[str, Path], output_folder: Unio
 
     new_sample_names = _featurecounts_get_sample_names(kwargs['files'], new_sample_names)
 
-    feature_counting.run_featurecounts_analysis(kwargs, output_folder, r_installation_folder)
-    counts, annotation, stats = _process_featurecounts_output(output_folder, new_sample_names)
-    return counts, annotation, stats
+    feature_counting.FeatureCountsRunner(kwargs, output_folder, r_installation_folder).run()
+    counts, annotation, stats, log_path = _process_featurecounts_output(output_folder, new_sample_names)
+    return (counts, annotation, stats, log_path) if return_log else (counts, annotation, stats)
 
 
 @_func_type('paired')
@@ -891,7 +896,9 @@ def featurecounts_paired_end(input_folder: Union[str, Path], output_folder: Unio
                              count_chimeric_fragments: bool = False, min_fragment_length: NonNegativeInt = 50,
                              max_fragment_length: Union[PositiveInt, None] = 600,
                              report_read_assignment: Union[Literal['bam', 'sam', 'core'], None] = None,
-                             threads: PositiveInt = 1) -> Tuple[filtering.CountFilter, pl.DataFrame, pl.DataFrame]:
+                             threads: PositiveInt = 1, return_log: bool = False) -> Union[
+    Tuple[filtering.CountFilter, pl.DataFrame, pl.DataFrame],
+    Tuple[filtering.CountFilter, pl.DataFrame, pl.DataFrame, Path]]:
     """
     Assign mapped paired-end sequencing reads to specified genomic features using \
     `RSubread featureCounts <https://doi.org/10.1093/bioinformatics/btt656>`_. \
@@ -961,6 +968,9 @@ def featurecounts_paired_end(input_folder: Union[str, Path], output_folder: Unio
     :type max_fragment_length: int > 0 or None (default=600)
     :param threads: number of threads to run featureCounts on. More threads will generally make feature counting faster.
     :type threads: int > 0 (default=1)
+    :param return_log: if True, the function will return the path to the analysis logfile, \
+    which includes session info.
+    :type return_log: bool (default=False)
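     :return: a count matrix (CountFilter) containing feature counts for all input files, \
     a DataFrame summarizing the features reads were aligned to, and a DataFrame summarizing the alignment statistics. \
     If return_log is True, the path to the analysis logfile is returned as an additional fourth element.
     :rtype: (filtering.CountFilter, pl.DataFrame, pl.DataFrame) or \
     (filtering.CountFilter, pl.DataFrame, pl.DataFrame, Path)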
@@ -978,10 +988,9 @@ def featurecounts_paired_end(input_folder: Union[str, Path], output_folder: Unio
     kwargs.update(paired_kwargs)
 
     new_sample_names = _featurecounts_get_sample_names(kwargs['files'], new_sample_names)
-
-    feature_counting.run_featurecounts_analysis(kwargs, output_folder, r_installation_folder)
-    counts, annotation, stats = _process_featurecounts_output(output_folder, new_sample_names)
-    return counts, annotation, stats
+    feature_counting.FeatureCountsRunner(kwargs, output_folder, r_installation_folder).run()
+    counts, annotation, stats, log_path = _process_featurecounts_output(output_folder, new_sample_names)
+    return (counts, annotation, stats, log_path) if return_log else (counts, annotation, stats)
 
 
 def _parse_featurecounts_misc_args(input_folder: Union[str, Path], output_folder: Path, gtf_file: Union[str, Path],
@@ -1034,6 +1043,7 @@ def _process_featurecounts_output(output_folder, new_sample_names):
     counts_path = Path(output_folder).joinpath('featureCounts_counts.csv')
     annotation_path = Path(output_folder).joinpath('featureCounts_annotation.csv')
     stats_path = Path(output_folder).joinpath('featureCounts_stats.csv')
+    log_path = Path(output_folder).joinpath('logfile.log')
 
     counts = filtering.CountFilter(counts_path)
     counts.df = counts.df.rename(
@@ -1047,7 +1057,7 @@ def _process_featurecounts_output(output_folder, new_sample_names):
     stats = stats.rename({oldname: newname for oldname, newname in zip(stats.columns[1:], new_sample_names)})
     io.save_table(stats, stats_path)  # re-save to reflect changes in column names
 
-    return counts, annotation, stats
+    return counts, annotation, stats, log_path
 
 
 @readable_name('Bowtie2 build index')

diff --git a/rnalysis/filtering.py b/rnalysis/filtering.py
@@ -3046,8 +3046,8 @@ def differential_expression_limma_voom(self, design_matrix: Union[str, Path],
                                            r_installation_folder: Union[str, Path, Literal['auto']] = 'auto',
                                            output_folder: Union[str, Path, None] = None,
                                            random_effect: Union[str, None] = None, quality_weights: bool = False,
-                                           return_design_matrix: bool = False, return_code: bool = False) -> \
-        Tuple['DESeqFilter', ...]:
+                                           return_design_matrix: bool = False, return_code: bool = False,
+                                           return_log: bool = False) -> Tuple['DESeqFilter', ...]:
         """
         Run differential expression analysis on the count matrix using the \
         `Limma-Voom <https://doi.org/10.1186/gb-2014-15-2-r29>`_ pipeline. \
@@ -3095,8 +3095,12 @@ def differential_expression_limma_voom(self, design_matrix: Union[str, Path],
         :type quality_weights: bool (default=False)
         :param return_design_matrix: if True, the function will return the sanitized design matrix used in the analysis.
         :type return_design_matrix: bool (default=False)
-        :param return_code: if True, the function will return the R script used to generate the analysis results.
+        :param return_code: if True, the function will return the path to the R script \
+        used to generate the analysis results.
         :type return_code: bool (default=False)
+        :param return_log: if True, the function will return the path to the analysis logfile, \
+        which includes session info.
+        :type return_log: bool (default=False)
         :return: a tuple of DESeqFilter objects, one for each comparison
         """
         if output_folder is not None:
@@ -3127,6 +3131,7 @@ def differential_expression_limma_voom(self, design_matrix: Union[str, Path],
                                                                lrt_factors, model_factors, r_installation_folder,
                                                                random_effect, quality_weights).run()
         code_path = None
+        log_path = None
         outputs = []
         for item in r_output_dir.iterdir():
             if not item.is_file():
@@ -3135,6 +3140,8 @@ def differential_expression_limma_voom(self, design_matrix: Union[str, Path],
                 outputs.append(DESeqFilter(item, log2fc_col='logFC', padj_col='adj.P.Val', pval_col='P.Value'))
             elif item.suffix == '.R':
                 code_path = item
+            elif item.suffix == '.log':
+                log_path = item
             if output_folder is not None:
                 with open(item) as infile, open(output_folder.joinpath(item.name), 'w') as outfile:
                     outfile.write(infile.read())
@@ -3146,6 +3153,11 @@ def differential_expression_limma_voom(self, design_matrix: Union[str, Path],
                 warnings.warn("No R script was generated during the analysis")
             else:
                 return_val = [return_val, code_path]
+        if return_log:
+            if log_path is None:
+                warnings.warn("No log file was generated during the analysis")
+            else:
+                return_val = [return_val, log_path]
         return return_val
 
     @readable_name('Run Limma-Voom differential expression (simplified mode)')
@@ -3155,8 +3167,8 @@ def differential_expression_limma_voom_simplified(self, design_matrix: Union[str
                                                       output_folder: Union[str, Path, None] = None,
                                                       random_effect: Union[str, None] = None,
                                                       quality_weights: bool = False,
-                                                      return_design_matrix: bool = False, return_code: bool = False
-                                                      ) -> Tuple['DESeqFilter', ...]:
+                                                      return_design_matrix: bool = False, return_code: bool = False,
+                                                      return_log: bool = False) -> Tuple['DESeqFilter', ...]:
         """
        Run differential expression analysis on the count matrix using the \
        `Limma-Voom <https://doi.org/10.1186/gb-2014-15-2-r29>`_ pipeline. \
@@ -3204,7 +3216,7 @@ def differential_expression_limma_voom_simplified(self, design_matrix: Union[str
                                                        output_folder=output_folder, random_effect=random_effect,
                                                        quality_weights=quality_weights,
                                                        return_design_matrix=return_design_matrix,
-                                                       return_code=return_code)
+                                                       return_code=return_code, return_log=return_log)
 
     @readable_name('Run DESeq2 differential expression')
     def differential_expression_deseq2(self, design_matrix: Union[str, Path],
@@ -3216,7 +3228,8 @@ def differential_expression_deseq2(self, design_matrix: Union[str, Path],
                                        output_folder: Union[str, Path, None] = None, return_design_matrix: bool = False,
                                        scaling_factors: Union[str, Path, None] = None,
                                        cooks_cutoff: bool = True,
-                                       return_code: bool = False) -> Tuple['DESeqFilter', ...]:
+                                       return_code: bool = False, return_log: bool = False
+                                       ) -> Tuple['DESeqFilter', ...]:
         """
         Run differential expression analysis on the count matrix using the \
         `DESeq2 <https://doi.org/10.1186/s13059-014-0550-8>`_ algorithm. \
@@ -3259,8 +3272,12 @@ def differential_expression_deseq2(self, design_matrix: Union[str, Path],
         :type output_folder: str, Path, or None
         :param return_design_matrix: if True, the function will return the sanitized design matrix used in the analysis.
         :type return_design_matrix: bool (default=False)
-        :param return_code: if True, the function will return the R script used to generate the analysis results.
+        :param return_code: if True, the function will return the path to the R script \
+        used to generate the analysis results.
         :type return_code: bool (default=False)
+        :param return_log: if True, the function will return the path to the analysis logfile, \
+        which includes session info.
+        :type return_log: bool (default=False)
         :return: a tuple of DESeqFilter objects, one for each comparison
         """
         if output_folder is not None:
@@ -3319,13 +3336,16 @@ def differential_expression_deseq2(self, design_matrix: Union[str, Path],
                                                            scale_factor_path, cooks_cutoff, scale_factor_ndims).run()
         outputs = []
         code_path = None
+        log_path = None
         for item in r_output_dir.iterdir():
             if not item.is_file():
                 continue
             if item.suffix == '.csv':
                 outputs.append(DESeqFilter(item))
             if item.suffix == '.R':
                 code_path = item
+            elif item.suffix == '.log':
+                log_path = item
             if output_folder is not None:
                 with open(item) as infile, open(output_folder.joinpath(item.name), 'w') as outfile:
                     outfile.write(infile.read())
@@ -3337,15 +3357,20 @@ def differential_expression_deseq2(self, design_matrix: Union[str, Path],
                 warnings.warn("No R script was generated during the analysis")
             else:
                 return_val = [return_val, code_path]
+        if return_log:
+            if log_path is None:
+                warnings.warn("No log file was generated during the analysis")
+            else:
+                return_val = [return_val, log_path]
         return return_val
 
     @readable_name('Run DESeq2 differential expression (simplified mode)')
     def differential_expression_deseq2_simplified(self, design_matrix: Union[str, Path],
                                                   comparisons: Iterable[Tuple[str, str, str]],
                                                   r_installation_folder: Union[str, Path, Literal['auto']] = 'auto',
                                                   output_folder: Union[str, Path, None] = None,
-                                                  return_design_matrix: bool = False, return_code: bool = False
-                                                  ) -> Tuple['DESeqFilter', ...]:
+                                                  return_design_matrix: bool = False, return_code: bool = False,
+                                                  return_log: bool = False) -> Tuple['DESeqFilter', ...]:
         """
         Run differential expression analysis on the count matrix using the \
         `DESeq2 <https://doi.org/10.1186/s13059-014-0550-8>`_ algorithm. \
@@ -3385,7 +3410,8 @@ def differential_expression_deseq2_simplified(self, design_matrix: Union[str, Pa
         return self.differential_expression_deseq2(design_matrix, comparisons,
                                                    r_installation_folder=r_installation_folder,
                                                    output_folder=output_folder,
-                                                   return_design_matrix=return_design_matrix, return_code=return_code)
+                                                   return_design_matrix=return_design_matrix, return_code=return_code,
+                                                   return_log=return_log)
 
     @readable_name('Calculate fold change')
     def fold_change(self, numerator: param_typing.ColumnNames, denominator: param_typing.ColumnNames,

diff --git a/rnalysis/gui/gui.py b/rnalysis/gui/gui.py
@@ -329,7 +329,7 @@ def __init__(self, parent=None):
 
 class DiffExpWindow(gui_windows.FuncExternalWindow):
     EXCLUDED_PARAMS = {'self', 'comparisons', 'covariates', 'lrt_factors', 'model_factors', 'return_design_matrix',
-                       'return_code'}
+                       'return_code', 'return_log'}
     IGNORED_WIDGETS = gui_windows.FuncExternalWindow.IGNORED_WIDGETS | {'load_design'}
 
     __slots__ = {'comparisons': 'list of comparisons to make',
@@ -438,6 +438,7 @@ def get_analysis_kwargs(self):
         kwargs = super().get_analysis_kwargs()
         kwargs['return_design_matrix'] = True
         kwargs['return_code'] = True
+        kwargs['return_log'] = True
 
         kwargs['comparisons'] = self.comparisons_widgets['picker'].get_comparison_values()
         if not self.simplified: