From 58d56f1c476ea5d222af6b26ae663bed16f51433 Mon Sep 17 00:00:00 2001
From: Adam Saleh
Date: Sep 02 2021 11:09:17 +0000
Subject: Filter log data to speed up processing

Signed-off-by: Adam Saleh
---
Review note: pre_process() previously fell through to a second `yield` after
yielding the fallback path, which makes contextlib raise
RuntimeError("generator didn't stop") on every grep failure (including the
ordinary "no matching lines" exit status 1).  The two yields are now mutually
exclusive.  The blob ids on the `index` lines are those of the original
revision of this patch.

diff --git a/countme/parse.py b/countme/parse.py
index 4556d26..b30e697 100644
--- a/countme/parse.py
+++ b/countme/parse.py
@@ -1,11 +1,42 @@
+from contextlib import contextmanager
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import Iterator, Union
+
 from countme.progress import ReadProgress
 
 
+@contextmanager
+def pre_process(filepath: Union[str, Path]) -> Iterator[str]:
+    """Yield a path to a copy of *filepath* holding only its "countme" lines.
+
+    The filtered copy is written by grep into a temporary file that is
+    removed when the context exits.  If grep exits non-zero (no matching
+    lines, or an error), the original path is yielded instead.
+    """
+    filepath = Path(filepath)
+    with NamedTemporaryFile(
+        prefix=f"mirrors-countme-{filepath.name}-",
+        suffix=".preprocessed",
+    ) as tmpfile:
+        import subprocess
+
+        print(f"Preprocessing file: {filepath}")
+        cmd = ["grep", "countme", str(filepath)]
+        r = subprocess.run(cmd, stdout=tmpfile)
+        if r.returncode != 0:
+            print(f"Preprocessing file failed, returning original: {filepath}")
+            yield str(filepath)
+        else:
+            # A @contextmanager generator must yield exactly once.
+            yield tmpfile.name
+
+
 def parse(args=None):
     if args.header or args.sqlite:
         args.writer.write_header()
 
-    for logf in ReadProgress(args.logs, display=args.progress):
+    for logf in ReadProgress(args.logs, display=args.progress, pre_process=pre_process):
         # Make an iterator object for the matching log lines
         match_iter = iter(args.matcher(logf))
 
diff --git a/countme/progress.py b/countme/progress.py
index 293ea50..71fd4c0 100644
--- a/countme/progress.py
+++ b/countme/progress.py
@@ -19,6 +19,10 @@
 
 import os
 import sys
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Iterator, Union
+
 from .regex import LOG_DATE_RE
 
 __all__ = (
@@ -84,20 +88,30 @@ def log_total_size(logfn):
     return os.stat(logfn).st_size
 
 
+@contextmanager
+def no_preprocess(filepath: Union[str, Path]) -> Iterator[str]:
+    """Default pre_process hook: yield *filepath* unchanged, as a string."""
+    yield str(filepath)
+
+
 class ReadProgressBase:
-    def __init__(self, logs, display=True):
+    def __init__(self, logs, display=True, pre_process=no_preprocess):
         """logs should be a sequence of line-iterable file-like objects.
         if display is False, no progress output will be printed."""
         self.logs = logs
         self.display = display
+        self.pre_process = pre_process
 
     def __iter__(self):
         """Iterator for ReadProgress; yields a sequence of line-iterable
         file-like objects (one for each log in logs)."""
         for num, logfn in enumerate(self.logs):
-            logf = log_reader(logfn)
-            total = log_total_size(logfn)
-            yield self._iter_log_lines(logf, num, total)
+            # Yield from inside the with-block so whatever pre_process set up
+            # (e.g. a temporary file) survives until the next log is requested.
+            with self.pre_process(logfn) as processed_log:
+                logf = log_reader(processed_log)
+                total = log_total_size(processed_log)
+                yield self._iter_log_lines(logf, num, total)
 
     def _iter_log_lines(self, logf, num, total):
         # Make a progress meter for this file