Source code for src.utils.reads_merger

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
r"""FAST(A|Q) Reads Merger.

    This script consolidates paired-end FASTQ files
    from a specified directory by merging multiple files per sample
    if necessary, or copying single files directly.
    It identifies samples based on a provided pattern,
    and processes R1 and R2 read files separately.

    Usage:
        python script_name.py -i <input> -o <output> \
            -id <id_regex> -r1 <r1_pattern> -r2 <r2_pattern>

    Options:
        --path        Input directory containing FASTQ files.
        --outpath     Output directory for merged FASTQ files.
        --id_pattern  Regex pattern to extract sample IDs from filenames.
        --r1_pattern  Regex pattern for R1 read files.
        --r2_pattern  Regex pattern for R2 read files.

    Example:
        python3.13 reads_merger.py \
        --path <workdir>/input_dir \
        --outpath <workdir>/output_dir \
        --id_pattern '<sample_base>_[\d]{4}_([^_]*){1,2}' \
        --r1_pattern '.*R1.*\.fastq\.gz' \
        --r2_pattern '.*R2.*\.fastq\.gz'
"""

import argparse
import os
import re
import shutil
import sys

from os import PathLike
from typing import AnyStr



[docs]
def parse_args() -> argparse.Namespace:
    """Read the merger's command-line options from ``sys.argv``.

        All five options are mandatory strings; each one has a long and a
        short spelling and is stored under the attribute named after its
        long form (``path``, ``outpath``, ``id_pattern``, ``r1_pattern``,
        ``r2_pattern``).

        Returns:
            argparse.Namespace:
                Namespace holding the five parsed option values.
    """
    cli = argparse.ArgumentParser(
        prog="FAST(A|Q) Reads Merger",
        description="Merges paired-end FASTQ files from a specified directory."
    )

    # (long flag, short flag, destination attribute, help text)
    option_table = (
        ('--path', '-i', 'path',
         'Input directory containing FASTQ files'),
        ('--outpath', '-o', 'outpath',
         'Output directory for merged files'),
        ('--id_pattern', '-id', 'id_pattern',
         'Regex pattern to identify sample IDs'),
        ('--r1_pattern', '-r1', 'r1_pattern',
         'Pattern for r1 reads'),
        ('--r2_pattern', '-r2', 'r2_pattern',
         'Pattern for r2 reads'),
    )

    for long_flag, short_flag, target, help_text in option_table:
        cli.add_argument(
            long_flag,
            short_flag,
            dest=target,
            type=str,
            required=True,
            help=help_text,
        )

    return cli.parse_args()




[docs]
def merge_fastq(
    path: PathLike[AnyStr],
    outpath: PathLike[AnyStr],
    id_pattern: str = r"(?:russco_[\d]{4}_(?:ffpe_cr|leu))",
    r1_pattern:
        str = r"[^\s]*R1[^\s]*(?:\.fa(?:st(?:a|q)))(?:\.(?:gz|bz|bgz))?",
    r2_pattern:
        str = r"[^\s]*R2[^\s]*(?:\.fa(?:st(?:a|q)))(?:\.(?:gz|bz|bgz))?"
) -> None:
    """Merge (or copy) the R1 and R2 read files of every sample in a directory.

        Sample IDs are collected by applying ``id_pattern`` to each file name
        in ``path``. For every sample and for each read direction, the files
        whose name contains ``<sample ID><read pattern>`` running up to the
        end of the name are selected, then:

        * several files -> byte-wise concatenated (the equivalent of ``cat``;
          concatenated gzip streams stay valid) into
          ``<outpath>/<sample>_R1.fastq.gz`` or ``<sample>_R2.fastq.gz``.
          The output name always ends in ``.fastq.gz`` regardless of the
          input extension;
        * exactly one file -> copied into ``outpath`` under its own name;
        * no file -> nothing is written for that read direction.

        Files are processed in sorted name order so that the R1 and R2
        outputs of a sample are concatenated lane-for-lane in the same order.

        Args:
            path:
                Input directory containing the read files.
            outpath:
                Output directory; created if it does not exist.
            id_pattern:
                Regex locating the sample ID inside a file name. With exactly
                one capturing group the group is the ID, otherwise the whole
                match is.
            r1_pattern:
                Regex for the part of an R1 file name following the sample ID.
            r2_pattern:
                Regex for the part of an R2 file name following the sample ID.

        Raises:
            SystemExit:
                With the usage exit status when any pattern is not a valid
                regular expression (the diagnostic goes to stderr).
            OSError:
                When the input directory cannot be listed or a file cannot
                be read or written.
    """
    error_msg = "Wrong regexp pattern was given or there are no any " \
        "coincided with the pattern files in input directory.\n" \
        f"Input directory: {path}\nOutput directory: {outpath}\n" \
        f"ID pattern: {id_pattern}\n" \
        f"R1 pattern: {r1_pattern}\n" \
        f"R2 pattern: {r2_pattern}"

    # Work on individual names (regular files only) instead of one joined
    # string, so a greedy pattern can never span two file names.
    names = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )
    read_patterns = (("R1", r1_pattern), ("R2", r2_pattern))

    # Compile everything before touching the output, so a bad pattern
    # cannot leave a half-written result behind.
    plan = []
    try:
        id_regex = re.compile(id_pattern)
        for _, pattern in read_patterns:
            re.compile(pattern)  # validate even when no sample is found
        for sample in _sample_ids(id_regex, names):
            for label, pattern in read_patterns:
                # The ID is literal text here; the read pattern is grouped
                # so its alternations stay local, and must reach the end of
                # the name (keeps e.g. "*.fastq.gz.md5" out).
                read_regex = re.compile(
                    re.escape(sample) + "(?:" + pattern + r")\Z"
                )
                plan.append((sample, label, read_regex))
    except re.error:  # re.PatternError is an alias of this on 3.13+
        print(error_msg, file=sys.stderr)
        # os.EX_USAGE is POSIX-only; 64 is its conventional value.
        sys.exit(getattr(os, "EX_USAGE", 64))

    os.makedirs(outpath, exist_ok=True)

    for sample, label, read_regex in plan:
        sources = [
            os.path.join(path, name)
            for name in names if read_regex.search(name)
        ]

        if len(sources) > 1:
            target = os.path.abspath(
                os.path.join(outpath, f"{sample}_{label}.fastq.gz")
            )
            # Never read the file that is being written (possible when
            # input and output directories coincide on a re-run).
            sources = [
                src for src in sources if os.path.abspath(src) != target
            ]
            _concatenate(sources, target)

        elif sources:
            try:
                shutil.copy(sources[0], outpath)
            except shutil.SameFileError:
                # Input and output directory are the same: already in place.
                pass


def _sample_ids(id_regex, names):
    """Return the sorted, de-duplicated, non-empty sample IDs in *names*."""
    # Mirror re.findall: a single capturing group designates the ID.
    use_group = id_regex.groups == 1
    found = set()
    for name in names:
        for hit in id_regex.finditer(name):
            sample = hit.group(1) if use_group else hit.group(0)
            if sample:
                found.add(sample)
    return sorted(found)


def _concatenate(sources, destination):
    """Write the bytes of every file in *sources*, in order, to *destination*."""
    with open(destination, "wb") as merged:
        for source in sources:
            with open(source, "rb") as part:
                shutil.copyfileobj(part, merged)



if __name__ == '__main__':
    # Script entry point: parse the CLI, make sure the output directory
    # exists, then merge/copy the read files.
    cli_args = parse_args()
    target_dir = cli_args.outpath

    if not os.path.exists(target_dir):
        os.makedirs(os.path.abspath(target_dir))

    merge_fastq(
        path=cli_args.path,
        outpath=target_dir,
        id_pattern=cli_args.id_pattern,
        r1_pattern=cli_args.r1_pattern,
        r2_pattern=cli_args.r2_pattern,
    )