Source code for src.utils.reads_merger

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
r"""FAST(A|Q) Reads Merger.

    This script consolidates paired-end FASTQ files
    from a specified directory by merging multiple files per sample
    if necessary, or copying single files directly.
    It identifies samples based on a provided pattern,
    and processes R1 and R2 read files separately.

    Usage:
        python script_name.py -i <input> -o <output> \
            -id <id_regex> -r1 <r1_pattern> -r2 <r2_pattern>

    Options:
        --path        Input directory containing FASTQ files.
        --outpath     Output directory for merged FASTQ files.
        --id_pattern  Regex pattern to extract sample IDs from filenames.
        --r1_pattern  Regex pattern for R1 read files.
        --r2_pattern  Regex pattern for R2 read files.

    Example:
        python3.13 reads_merger.py \
        --path <workdir>/input_dir \
        --outpath <workdir>/output_dir \
        --id_pattern '<sample_base>_[\d]{4}_([^_]*){1,2}' \
        --r1_pattern '.*R1.*\.fastq\.gz' \
        --r2_pattern '.*R2.*\.fastq\.gz'
"""

import argparse
import os
import re
import shutil
import sys

from os import PathLike
from typing import AnyStr


[docs] def parse_args() -> argparse.Namespace: """Parses command-line arguments for input and output paths, and patterns. Returns: args: Parsed arguments namespace. """ parser = argparse.ArgumentParser( prog="FAST(A|Q) Reads Merger", description="Merges paired-end FASTQ files from a specified directory." ) arguments = [ { 'name': ('--path', '-i'), 'kwargs': { 'dest': 'path', 'type': str, 'required': True, 'help': 'Input directory containing FASTQ files' } }, { 'name': ('--outpath', '-o'), 'kwargs': { 'dest': 'outpath', 'type': str, 'required': True, 'help': 'Output directory for merged files' } }, { 'name': ('--id_pattern', '-id'), 'kwargs': { 'dest': 'id_pattern', 'type': str, 'required': True, 'help': 'Regex pattern to identify sample IDs' } }, { 'name': ('--r1_pattern', '-r1'), 'kwargs': { 'dest': 'r1_pattern', 'type': str, 'required': True, 'help': 'Pattern for r1 reads' } }, { 'name': ('--r2_pattern', '-r2'), 'kwargs': { 'dest': 'r2_pattern', 'type': str, 'required': True, 'help': 'Pattern for r2 reads' } } ] for arg in arguments: parser.add_argument(*arg['name'], **arg['kwargs']) return parser.parse_args()
def _follows_sample(name: str, sample: str, read_re: re.Pattern[str]) -> bool:
    """Tell whether *name* holds *sample* followed by a full *read_re* match.

    Every occurrence of the literal sample ID inside the file name is tried;
    the read pattern has to consume the rest of the name up to its end, so
    side files such as ``*.fastq.gz.md5`` are not picked up.
    """
    start = name.find(sample)
    while start != -1:
        if read_re.fullmatch(name, start + len(sample)):
            return True
        start = name.find(sample, start + 1)
    return False


def merge_fastq(
    path: PathLike[AnyStr],
    outpath: PathLike[AnyStr],
    id_pattern: str = r"(?:russco_[\d]{4}_(?:ffpe_cr|leu))",
    r1_pattern: str = r"[^\s]*R1[^\s]*(?:\.fa(?:st(?:a|q)))(?:\.(?:gz|bz|bgz))?",
    r2_pattern: str = r"[^\s]*R2[^\s]*(?:\.fa(?:st(?:a|q)))(?:\.(?:gz|bz|bgz))?"
):
    """Merge or copy the R1 and R2 read files of every sample in a directory.

    Sample IDs are the distinct non-empty full matches of ``id_pattern`` in
    the names of the regular files inside ``path``. For each sample and each
    read direction, the files whose name contains the sample ID immediately
    followed by text that fully matches the read pattern are selected:

    * several files are concatenated byte-for-byte, in sorted name order,
      into ``<outpath>/<sample>_R1.fastq.gz`` (or ``..._R2.fastq.gz``);
    * a single file is copied into ``outpath`` under its original name;
    * no file means nothing is written for that sample/direction.

    Files are handled with Python file APIs rather than a shell, so names
    containing spaces or shell metacharacters are safe.

    Args:
        path: Input directory holding the read files.
        outpath: Output directory; created if it does not exist.
        id_pattern: Regex whose full match is the sample ID.
        r1_pattern: Regex for the part of an R1 file name after the sample ID.
        r2_pattern: Regex for the part of an R2 file name after the sample ID.

    Raises:
        SystemExit: With the usage exit status when one of the patterns is
            not a valid regular expression.
        OSError: When the input directory cannot be listed or a file cannot
            be read or written.
    """
    error_msg = (
        "Wrong regexp pattern was given or there are no any "
        "coincided with the pattern files in input directory.\n"
        f"Input directory: {path}\nOutput directory: {outpath}\n"
        f"ID pattern: {id_pattern}\n"
        f"R1 pattern: {r1_pattern}\n"
        f"R2 pattern: {r2_pattern}"
    )

    # Sorting makes the concatenation order deterministic and identical for
    # both mates (os.listdir order is arbitrary), which keeps pairs in sync.
    file_names = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )

    # Validate every user-supplied pattern before touching any file.
    try:
        sample_re = re.compile(id_pattern)
        read_patterns = (
            ('R1', re.compile(r1_pattern)),
            ('R2', re.compile(r2_pattern)),
        )
    except re.error:  # re.PatternError is this same class on Python 3.13+
        print(error_msg, file=sys.stderr)
        # os.EX_USAGE only exists on Unix; 64 is its conventional value.
        sys.exit(getattr(os, 'EX_USAGE', 64))

    # group(0) keeps the whole match even if id_pattern has capture groups.
    samples = sorted({
        match.group(0)
        for name in file_names
        for match in sample_re.finditer(name)
        if match.group(0)
    })
    if not samples:
        # Nothing to do: report it instead of finishing silently.
        print(error_msg, file=sys.stderr)
        return

    os.makedirs(outpath, exist_ok=True)
    for sample in samples:
        for label, read_re in read_patterns:
            files = [
                name for name in file_names
                if _follows_sample(name, sample, read_re)
            ]
            if len(files) > 1:
                target = os.path.abspath(os.path.join(
                    outpath, f"{sample}_{label}.fastq.gz"
                ))
                # Plain byte concatenation, equivalent to `cat a b > target`.
                with open(target, 'wb') as merged:
                    for name in files:
                        with open(os.path.join(path, name), 'rb') as part:
                            shutil.copyfileobj(part, merged)
            elif files:
                shutil.copy(os.path.join(path, files[0]), outpath)
if __name__ == '__main__':
    # Script entry point: read the CLI options, make sure the output
    # directory exists, then merge/copy the read files.
    args = parse_args()

    # exist_ok=True replaces the separate os.path.exists() check, which
    # could race with another process creating the directory in between.
    os.makedirs(os.path.abspath(args.outpath), exist_ok=True)

    merge_fastq(
        args.path,
        args.outpath,
        args.id_pattern,
        args.r1_pattern,
        args.r2_pattern
    )