"""This module defines an interface for a sample data factory
and a concrete factory that implements it.
It provides a way to parse sample data from a directory of read files,
storing the information in a structured manner.
The `ISampleDataFactory` protocol defines the required method
for parsing sample data, and `SampleDataFactory` implements it, returning
a `SampleDataContainer` (imported from `src.core.sample_data_container`)
that stores sample data paths and identifiers.
Missing read files are reported through the `logging` module.
"""
# region Imports
import logging
import os
import re
from pathlib import Path
from os import PathLike
from typing import AnyStr
from typing import Protocol
from src.core.base import LoggerMixin, get_unique_path
from src.core.sample_data_container import SampleDataContainer
# endregion
[docs]
class ISampleDataFactory(Protocol):
    """Structural interface for sample data factories.

    A conforming class provides a single method, ``parse_sample_data``,
    that turns a directory and a sample identifier into a
    ``SampleDataContainer``.
    """

    def parse_sample_data(
        self, path: PathLike[AnyStr], sample_id: AnyStr
    ) -> SampleDataContainer:
        """Build the data container for one sample.

        Args:
            path (PathLike[AnyStr]):
                Directory path containing sample files.
            sample_id (AnyStr):
                Identifier for the sample.

        Returns:
            SampleDataContainer:
                An instance containing parsed sample data.
        """
        ...
[docs]
class SampleDataFactory(LoggerMixin, ISampleDataFactory):
    """Concrete implementation of the ISampleDataFactory interface.

    Locates the R1/R2 read files of a sample inside a directory and
    describes them, together with the sample's processing, log and report
    locations, in a ``SampleDataContainer``. A missing read file is
    reported through ``self.logger``.
    """

    def __init__(
        self,
        logger: logging.Logger = None,
        outpath: PathLike[AnyStr] = None,
    ):
        """Initializes the factory with an optional logger and output root.

        Args:
            logger (logging.Logger, optional):
                Logger instance, forwarded to the parent initializer.
                Defaults to None.
            outpath (PathLike[AnyStr], optional):
                Root directory under which the per-sample processing
                directories are located. When None, the value returned
                by ``get_unique_path()`` is used instead.
        """
        if outpath is not None:
            self.outpath = Path(outpath)
        else:
            self.outpath = get_unique_path()
        super().__init__(logger=logger)

    def parse_sample_data(
        self,
        path: PathLike[AnyStr],
        sample_id: AnyStr
    ) -> SampleDataContainer:
        """Parses sample data files from a directory based on the sample ID.

        Looks for directory entries whose names contain the sample ID and
        'R1' or 'R2'. A name containing both markers is treated as R1.
        When several entries qualify for the same read, the last one in
        sorted order is used.

        Args:
            path (PathLike[AnyStr]):
                Directory path containing sample files.
            sample_id (AnyStr):
                Identifier for the sample. It is matched literally, so
                regex metacharacters in it have no special meaning.

        Returns:
            SampleDataContainer:
                An instance with source paths for R1 and R2,
                or None if either file is not found.
        """
        # Escape the ID: interpolating it raw would let characters such as
        # '.', '+' or '(' act as regex syntax (wrong matches or re.error).
        id_pattern = re.compile(re.escape(sample_id))

        # os.listdir() yields names in arbitrary order; sorting makes the
        # selection reproducible when more than one file matches a read.
        candidates = sorted(
            name for name in os.listdir(path) if id_pattern.search(name)
        )

        sample_r1_path, sample_r2_path = None, None
        for name in candidates:
            if 'R1' in name:
                sample_r1_path = name
            elif 'R2' in name:
                sample_r2_path = name

        if sample_r1_path is None or sample_r2_path is None:
            # Only the first missing read is named, R1 taking precedence.
            self.logger.critical(
                "Can't find '%s' file for sample '%s'",
                'R1' if sample_r1_path is None else 'R2',
                sample_id.strip()
            )
            return None

        # Every output location of the sample lives under <outpath>/<id>.
        processing_path = os.path.abspath(
            os.path.join(self.outpath, sample_id)
        )
        return SampleDataContainer(
            r1_source=os.path.join(path, sample_r1_path),
            r2_source=os.path.join(path, sample_r2_path),
            sid=sample_id,
            processing_path=processing_path,
            processing_logpath=os.path.join(processing_path, "log"),
            report_path=os.path.join(processing_path, "report")
        )