#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""

**pyline**

A simple UNIX tool for line-based processing in Python.

Features:

* Python str.split by a delimiter (``-F``)
* Python Regex (``-r``, ``--regex``, ``-R``, ``--regex-options``)
* Output as txt, csv, tsv, json (``-O``, ``--output-filetype``)
* (Lazy) sorting (``-s``, ``--sort-asc``, ``-S``, ``--sort-desc``)
* Create Path.py (or pathlib) objects from each line (``-p``)
* namedtuples, ``yield``ing generators

**Usage**

Shell::

    pyline.py --help
    pyline.py --test

    # Print every line (null transform)
    cat ~/.bashrc | pyline.py line
    cat ~/.bashrc | pyline.py l

    # Number every line
    cat ~/.bashrc | pyline -n l

    # Print every word (split on --input-delim; default: whitespace)
    cat ~/.bashrc | pyline.py words
    cat ~/.bashrc | pyline.py w

    # Print the second word of each line (or "?" if there is none)
    cat ~/.bashrc | pyline.py 'len(w) >= 2 and w[1] or "?"'

    # Select the last word, dropping lines with no words
    pyline.py -f ~/.bashrc 'w[-1:]'

    # Regex matching with groups
    cat ~/.bashrc | pyline.py -n -r '^#(.*)' 'rgx and rgx.group()'


"""

from __future__ import print_function

import csv
import json
import logging
import operator
import textwrap

from collections import namedtuple

EPILOG = __doc__  # """  """

REGEX_DOC = """I  IGNORECASE  Perform case-insensitive matching.
L  LOCALE      Make \w, \W, \b, \B, dependent on the current locale.
M  MULTILINE   "^" matches the beginning of lines (after a newline)
                as well as the string.
                "$" matches the end of lines (before a newline) as well
                as the end of the string.
S  DOTALL      "." matches any character at all, including the newline.
X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
U  UNICODE     Make \w, \W, \b, \B, dependent on the Unicode locale."""
REGEX_OPTIONS = dict(
    (l[0],
        (l[1:14].strip(), l[15:]))
    for l in REGEX_DOC.split('\n'))
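# Illustrative note (not in the original source): REGEX_OPTIONS maps each
# single-letter flag to a (NAME, description) tuple parsed from REGEX_DOC,
# e.g. REGEX_OPTIONS['I'] == ('IGNORECASE', 'Perform case-insensitive matching.')
# Continuation lines in REGEX_DOC collapse into a throwaway ' ' key.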

STANDARD_REGEXES = {}

log = logging.getLogger()
log.setLevel(logging.INFO)


class NullHandler(logging.Handler):
    def emit(self, record):
        pass


h = NullHandler()
log.addHandler(h)

Result = namedtuple('Result', ('n', 'result'))


class PylineResult(Result):
    def __str__(self):
        result = self.result
        odelim = u'\t'  # TODO
        if result is None or result is False:
            return result
        elif hasattr(self.result, 'itervalues'):
            for col in self.result.itervalues():
                return odelim.join(str(s) for s in self.result.itervalues())
        elif hasattr(self.result, '__iter__'):
            result = odelim.join(str(s) for s in result)
        else:
            if result[-1] == '\n':
                result = result[:-1]
        return result

    def _numbered(self, **opts):
        yield self.n
        if self.result is None or self.result is False:
            yield self.result
        elif hasattr(self.result, 'itervalues'):
            for col in self.result.itervalues():
                yield col
        elif hasattr(self.result, '__iter__'):
            for col in self.result:
                yield col
        elif hasattr(self.result, 'rstrip'):
            yield self.result.rstrip()

    def _numbered_str(self, odelim):
        record = self._numbered()
        return ' %4d%s%s' % (
            record.next(),
            odelim,
            unicode(odelim).join(str(x) for x in record))
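

# Example (illustrative, not part of the original module): a PylineResult
# pairs the input line number with the evaluated result; str() joins iterable
# results with a tab:
#
#   >>> str(PylineResult(n=0, result=['a', 'b']))
#   'a\tb'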


def _import_path_module():
    Path = None
    try:
        from path import path as Path
    except ImportError:
        try:
            from pathlib import Path
        except ImportError:
            log.error("pip install pathlib (or path.py)")
            Path = str  # os.exists, os
    return Path


Path = _import_path_module()


def get_path_module():
    return Path


def pyline(iterable,
           cmd=None,
           modules=[],
           regex=None,
           regex_options=None,
           path_tools=False,
           idelim=None,
           odelim="\t",
           **kwargs):
    """
    Pyline: process an iterable

    Args:
        iterable (iterable): iterable of strings (e.g. sys.stdin or a file)
        cmd (str): python command string
        modules ([str]): list of modules to import
        regex (str): regex pattern to match (with groups)
        regex_options (TODO): Regex options: I L M S X U (see ``pydoc re``)
        path_tools (bool): try to cast each line to a file
        idelim (str): input delimiter
        odelim (str): output delimiter

    Returns:
        iterable of PylineResult namedtuples
    """
    for _importset in modules:
        for _import in _importset.split(','):
            locals()[_import] = __import__(_import.strip())

    _rgx = None
    if regex:
        import re
        _regexstr = regex
        if bool(regex_options):
            _regexstr = ("(?%s)" % (regex_options)) + _regexstr
        # _regexstr = r"""(?%s)%s""" % (
        #     ''.join(
        #         l.lower() for l in regex_options
        #         if l.lower() in REGEX_OPTIONS),
        #     _regexstr)
        log.debug("_rgx = %r" % _regexstr)
        _rgx = re.compile(_regexstr)

    if cmd is None:
        if regex:
            cmd = "rgx and rgx.groups()"
            # cmd = "rgx and rgx.groupdict()"
        else:
            cmd = "line"
        if path_tools:
            cmd = "p"

    Path = None
    if path_tools:
        Path = get_path_module()

    try:
        log.info("_cmd: %r" % cmd)
        codeobj = compile(cmd, 'command', 'eval')
    except Exception as e:
        e.message = "%s\ncmd: %s" % (e.message, cmd)
        log.error(repr(cmd))
        log.exception(e)
        raise

    def item_keys(obj, keys):
        if isinstance(keys, (str, unicode)):
            keys = [keys]
        for k in keys:
            if k is None:
                yield k
            else:
                yield obj.__getslice__(k)

    k = lambda obj, keys=(':',): [obj.__getslice__(k) for k in keys]
    j = lambda args: odelim.join(str(_value) for _value in args)
    # from itertools import imap, repeat
    # j = lambda args: imap(str, izip_longest(args, repeat(odelim)))

    for i, line in enumerate(iterable):
        l = line
        w = words = [w for w in line.strip().split(idelim)]

        p = path = None
        if path_tools and line.rstrip():
            try:
                p = path = Path(line.strip()) or None
            except Exception as e:
                log.exception(e)

        rgx = _rgx and _rgx.match(line) or None

        # Note: eval
        try:
            result = eval(codeobj, globals(), locals())
        except Exception as e:
            e.cmd = cmd
            log.exception(repr(cmd))
            log.exception(e)
            raise
        yield PylineResult(i, result)
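

# Usage sketch (illustrative; any iterable of strings works, e.g. sys.stdin):
#
#   for r in pyline(['one two\n', 'three\n'], cmd='w[0]'):
#       print(r.n, r.result)   # -> 0 one / 1 three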


def itemgetter_default(args, default=None):
    """Return a callable object that fetches the given item(s) from its
    operand, or the specified default value.

    Similar to operator.itemgetter except returns ``default``
    when the index does not exist
    """
    if args is None:
        columns = xrange(len(args))
    else:
        columns = args

    def _itemgetter(row):
        for col in columns:
            try:
                yield row[col]
            except IndexError:
                yield default
    return _itemgetter
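

# For example (illustrative): itemgetter_default((0, 2), default=None) applied
# to the row ['a', 'b'] yields 'a' and then None, because index 2 is missing.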


def get_list_from_str(str_, cast_callable=int):
    if not str_ or not str_.strip():
        return []
    return [cast_callable(x.strip()) for x in str_.split(',')]


def sort_by(sortstr, nl, reverse=False):
    columns = get_list_from_str(sortstr)
    log.debug("columns: %r" % columns)
    # get_columns = operator.itemgetter(*columns)  # raises on missing columns
    get_columns = itemgetter_default(columns, default=None)
    # materialize the key generator into a tuple so rows sort by column values
    return sorted(nl,
                  key=lambda row: tuple(get_columns(row)),
                  reverse=reverse)
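

# Sketch (illustrative): sort rows by column 1, ascending:
#
#   sort_by("1", [['b', 2], ['a', 1]])   # -> [['a', 1], ['b', 2]]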


class ResultWriter(object):
    OUTPUT_FILETYPES = {
        'csv': ",",
        'json': True,
        'tsv': "\t",
        'html': True,
        "txt": True,
        "checkbox": True,
    }
    filetype = None

    def __init__(self, _output, *args, **kwargs):
        self._output = _output
        self._conf = kwargs
        self.setup(_output, *args, **kwargs)

    def setup(self, *args, **kwargs):
        pass

    def set_output(self, _output):
        if _output and self._output is not None:
            raise Exception()
        else:
            self._output = _output

    def header(self, *args, **kwargs):
        pass

    def write(self, obj):
        print(obj, file=self._output)

    def write_numbered(self, obj):
        print(obj, file=self._output)

    def footer(self, *args, **kwargs):
        pass

    @classmethod
    def get_writer(cls, _output, filetype="csv", **kwargs):
        """get writer object for _output with the specified filetype

        :param output_filetype: txt | csv | tsv | json | html | checkbox
        :param _output: output file
        """
        output_filetype = filetype.strip().lower()
        if output_filetype not in ResultWriter.OUTPUT_FILETYPES:
            raise Exception()
        writer = None
        if output_filetype == "txt":
            writer = ResultWriter_txt(_output)
        elif output_filetype == "csv":
            writer = ResultWriter_csv(_output, **kwargs)
        elif output_filetype == "tsv":
            writer = ResultWriter_csv(_output, delimiter='\t', **kwargs)
        elif output_filetype == "json":
            writer = ResultWriter_json(_output)
        elif output_filetype == "html":
            writer = ResultWriter_html(_output, **kwargs)
        elif output_filetype == "checkbox":
            writer = ResultWriter_checkbox(_output, **kwargs)
        else:
            raise NotImplementedError()
        return (
            writer,
            (kwargs.get('number_lines')
             and writer.write_numbered
             or writer.write))
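

# Sketch (illustrative): get_writer returns the writer and the bound write
# function (write_numbered when number_lines is set):
#
#   import sys
#   writer, write = ResultWriter.get_writer(sys.stdout, filetype='csv')
#   writer.header()
#   write(PylineResult(0, ['a', 'b']))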


class ResultWriter_txt(ResultWriter):
    filetype = 'txt'

    def write_numbered(self, obj):
        self.write(obj._numbered_str(odelim='\t'))


class ResultWriter_csv(ResultWriter):
    filetype = 'csv'

    def setup(self, *args, **kwargs):
        self.delimiter = kwargs.get(
            'delimiter',
            ResultWriter.OUTPUT_FILETYPES.get(
                self.filetype, ','))
        self._output_csv = csv.writer(self._output,
                                      quoting=csv.QUOTE_NONNUMERIC,
                                      delimiter=self.delimiter)
        # doublequote=True)

    def header(self, *args, **kwargs):
        attrs = kwargs.get('attrs', PylineResult._fields)
        self._output_csv.writerow(attrs)

    def write(self, obj):
        self._output_csv.writerow(obj.result)

    def write_numbered(self, obj):
        self._output_csv.writerow(tuple(obj._numbered()))


class ResultWriter_json(ResultWriter):
    filetype = 'json'

    def write(self, obj):
        print(
            json.dumps(
                obj._asdict(),
                indent=2),
            end=',\n',
            file=self._output)

    write_numbered = write


class ResultWriter_html(ResultWriter):
    filetype = 'html'

    def header(self, *args, **kwargs):
        attrs = kwargs.get('attrs')
        self._output.write("<table>")
        self._output.write("<tr>")
        if bool(attrs):
            for col in attrs:
                self._output.write(u"<th>%s</th>" % col)
        self._output.write("</tr>")

    def _html_row(self, obj):
        yield '\n<tr>'
        for attr, col in obj._asdict().iteritems():  # TODO: zip(_fields, ...)
            yield "<td%s>" % (
                attr is not None and (' class="%s"' % attr) or '')
            if hasattr(col, '__iter__'):
                for value in col:
                    yield u'<span>%s</span>' % value
            else:
                # TODO
                yield u'%s' % (
                    col and hasattr(col, 'rstrip') and col.rstrip()
                    or str(col))
            yield "</td>"
        yield "</tr>"

    def write(self, obj):
        return self._output.write(u''.join(self._html_row(obj,)))

    def footer(self):
        self._output.write('</table>\n')


class ResultWriter_checkbox(ResultWriter):
    filetype = 'checkbox'

    def _checkbox_row(self, obj, wrap=79):
        yield u'\n'.join(textwrap.wrap(
            unicode(obj),
            initial_indent=u'- [ ] ',
            subsequent_indent=u'      '))
        yield '\n'

    def write(self, obj):
        return self._output.write(u''.join(self._checkbox_row(obj)))
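

# Example (illustrative): each result is rendered as a Markdown-style task
# item; write(PylineResult(0, 'TODO: refactor\n')) emits roughly:
#
#   - [ ] TODO: refactor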


def get_option_parser():
    import optparse

    prs = optparse.OptionParser(
        usage="%prog: [options] \"<command>\"",
        epilog=EPILOG)

    prs.add_option('-f',
                   dest='file',
                   action='store',
                   default='-',
                   help="Input file (default: '-' for stdin)")
    prs.add_option('-o', '--output-file',
                   dest='output',
                   action='store',
                   default='-',
                   help="Output file (default: '-' for stdout)")
    prs.add_option('-O', '--output-filetype',
                   dest='output_filetype',
                   action='store',
                   default='txt',
                   help="Output filetype <txt|csv|tsv|json> (default: txt)")
    prs.add_option('-F', '--input-delim',
                   dest='idelim',
                   action='store',
                   default=None,
                   help=('String input field delimiter to split each line'
                         ' into ``words``'
                         ' (default: None (whitespace))'))
    prs.add_option('-d', '--output-delim',
                   dest='odelim',
                   default="\t",
                   help=('String output delimiter for lists and tuples'
                         ' (default: \t (tab))'))
    prs.add_option('-m', '--modules',
                   dest='modules',
                   action='append',
                   default=[],
                   help='Module name to import (default: []) see -p and -r')
    prs.add_option('-p', '--path-tools',
                   dest='path_tools',
                   action='store_true',
                   help='Create path objects from each ``line``')
    prs.add_option('-r', '--regex',
                   dest='regex',
                   action='store',
                   help='Regex to compile and match as ``rgx``')
    prs.add_option('-R', '--regex-options',
                   dest='regex_options',
                   action='store',
                   default='im',
                   help='Regex options: I L M S X U (see ``pydoc re``)')
    prs.add_option("-s", "--sort-asc",
                   dest="sort_asc",
                   action='store',
                   help="Sort ascending by field number")
    prs.add_option("-S", "--sort-desc",
                   dest="sort_desc",
                   action='store',
                   help="Sort descending by field number")
    prs.add_option('-n', '--number-lines',
                   dest='number_lines',
                   action='store_true',
                   help='Print line numbers of matches')
    prs.add_option('-i', '--ipython',
                   dest='start_ipython',
                   action='store_true',
                   help='Start IPython with results')
    prs.add_option('-v', '--verbose',
                   dest='verbose',
                   action='store_true')
    prs.add_option('-q', '--quiet',
                   dest='quiet',
                   action='store_true')
    prs.add_option('-t', '--test',
                   dest='run_tests',
                   action='store_true')
    return prs


def get_sort_function(opts):  # (sort_asc, sort_desc)
    # FIXME
    sortfunc = None
    if opts.sort_asc:
        logging.debug("sort_asc: %r" % opts.sort_asc)
        if sortfunc is None:
            sortfunc = (
                lambda _output: sort_by(opts.sort_asc,
                                        _output,
                                        reverse=False))
        else:
            sortfunc = (
                lambda _output: sort_by(opts.sort_asc,
                                        sortfunc(_output)))
    if opts.sort_desc:
        logging.debug("sort_desc: %r" % opts.sort_desc)
        if sortfunc is None:
            sortfunc = (
                lambda _output: sort_by(opts.sort_desc,
                                        _output,
                                        reverse=True))
        else:
            sortfunc = (
                lambda _output: sort_by(opts.sort_desc,
                                        sortfunc(_output)))
    return sortfunc
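

# Sketch (illustrative): with ``--sort-asc=1`` this returns a callable that
# sorts collected results by column 1 via sort_by(); with neither sort option
# it returns None and main() streams results without buffering them.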


def main(*args):
    import logging
    import sys

    prs = get_option_parser()

    args = args and list(args) or sys.argv[1:]
    (opts, args) = prs.parse_args(args)

    if not opts.quiet:
        logging.basicConfig()
        if opts.verbose:
            logging.getLogger().setLevel(logging.DEBUG)
            logging.debug(opts.__dict__)

    if opts.run_tests:
        sys.argv = [sys.argv[0]] + args
        import unittest
        exit(unittest.main())

    sortfunc = get_sort_function(opts)

    cmd = ' '.join(args)
    if not cmd.strip():
        if opts.regex:
            if opts.output_filetype == 'json' and '<' in opts.regex:
                cmd = 'rgx and rgx.groupdict()'
            else:
                cmd = 'rgx and rgx.groups()'
        else:
            cmd = 'line'
    cmd = cmd.strip()
    opts.cmd = cmd
    if opts.verbose:
        logging.debug(opts.__dict__)

    opts.attrs = PylineResult._fields

    try:
        if opts.file == '-':
            opts._file = sys.stdin
        else:
            opts._file = open(opts.file, 'r')
        if opts.output == '-':
            opts._output = sys.stdout
        else:
            opts._output = open(opts.output, 'w')

        writer, output_func = ResultWriter.get_writer(
            opts._output,
            filetype=opts.output_filetype,
            number_lines=opts.number_lines,
            attrs=opts.attrs)
        writer.header()

        # if not sorting, stream results as they are produced
        if not sortfunc:
            for result in pyline(opts._file, **opts.__dict__):
                if not result.result:
                    # skip result if not bool(result.result)
                    continue  # TODO
                output_func(result)
        # if sorting, collect the results and emit the sorted list
        else:
            results = []
            for result in pyline(opts._file, **opts.__dict__):
                if not result.result:
                    # skip result if not bool(result.result)
                    continue
                results.append(result)
            for result in sortfunc(results):
                output_func(result)

        writer.footer()
    finally:
        if getattr(opts._file, 'fileno', int)() not in (0, 1, 2):
            opts._file.close()
        if opts.output != '-':
            opts._output.close()


if __name__ == "__main__":
    main()