I recently published a library for advanced awk-like file manipulation in Python 3. The code can be found here and here is the documentation. It is also available for download from pip (pip install awk). I would like to know if the code is well designed and how it can be improved to enforce readability and code reuse. I would also like to know if efficiency can be improved, keeping in mind that it should be able to handle large files.
import refrom itertools import zip_longestfrom collections import OrderedDictclass FileNotOpenException(Exception): passclass FieldNotFoundException(Exception): passDEFAULT_FIELD_SEP = r'\s+'def _DEFAULT_FIELD_FUNC(field_key, field): return fielddef _DEFAULT_FIELD_FILTER(field_key, field): return Truedef _DEFAULT_RECORD_FUNC(NR, record): return recorddef _DEFAULT_RECORD_FILTER(NR, record): return Trueclass Record(object): def __init__(self): """Initialises a Record object""" self._field_dict = {} self._field_list = [] self._key_list = [] self._iterator = None def __getitem__(self, key): """Allows access to fields in the following forms: - record[2] # column indices start from 0 - record[4:7:2] # same as above - record['$4'] # same as record[3] - record['mykey'] # columns are indexed based on header, if present """ try: try: return self._field_dict[key] except (KeyError, TypeError): # nonexisting key or slice, respectively return self._field_list[key] except IndexError: raise FieldNotFoundException('No field {} in record'.format(key)) def __setitem__(self, key, val): """should never be done manually, better create a new record than modifying an existing one""" self._field_dict[key] = val self._key_list.append(key) self._field_list.append(val) def add(self, val): """should never be done manually, better create a new record than modifying an existing one""" self['${}'.format(len(self._field_list) + 1)] = val def fields(self): """returns a generator of the record's fields""" yield from self._field_list def keys(self): """returns a generator of the record's keys""" yield from self._key_list def __iter__(self): """returns an iterator over the record's keys""" self._iterator = iter(self._key_list) return self def __next__(self): """returns the next (key, field) pair. 
If a header was provided, the key corresponds to the header otherwise it is of the form $1, $2, ..., $NF""" try: next_key = next(self._iterator) return next_key, self._field_dict[next_key] except StopIteration: self._iterator = None raise StopIteration def __len__(self): return len(self._field_list) @property def NF(self): """same as awk's NF variable""" return len(self) def __bool__(self): return bool(len(self)) def __str__(self): return 'Record({})'.format(', '.join(['{}: {}'.format(key, self._field_dict[key]) for key in self._key_list]))class Reader(object): # TODO: add field type def __init__(self, filename, fs=DEFAULT_FIELD_SEP, header=False, max_lines=None, field_filter=_DEFAULT_FIELD_FILTER, record_filter=_DEFAULT_RECORD_FILTER): """Initialises a Reader Arguments: filename -- the name of the file to parse Keyword arguments: fs -- regex that separates the fields header -- if set to True, the reader interprets the first line of the file as a header. In this case every record is returned as a dictionary and every field in the header is used as the key of the corresponding field in the following lines max_lines -- the maximum number of lines to read field_filter -- a function f(key, field) which is applied to the field. If it returns a falsy value, the field is not included in the record. default: lambda *args: True record_filter -- a function f(NR, field) which is applied to the record. If it returns a falsy value, the record is not returned. 
default: lambda *args: True """ self.filename = filename self.header = header self.fs = fs self.max_lines = max_lines self.field_filter = field_filter self.record_filter = record_filter self._compiled_fs = re.compile(fs) self._openfile = None self._keys = None @property def keys(self): """returns the keys of the header if present, otherwise None""" return self._keys def __enter__(self): self._openfile = open(self.filename) self.lines = 0 if self.header: first_line = next(self._openfile).rstrip() self._keys = tuple(self._compiled_fs.split(first_line)) return self def __exit__(self, *args): self._openfile.close() self.lines = 0 self._openfile = None def __iter__(self): return self def _get_record(self, fields): record = Record() if self.header: if len(fields) > len(self._keys): zip_func = zip else: zip_func = zip_longest for key, value in zip_func(self._keys, fields): if self.field_filter(key, value): record[key] = value else: # indexes start from 0 for key, value in enumerate(fields): if self.field_filter(key, value): record.add(value) return record def _get_next(self): if self._openfile is None: raise FileNotOpenException if self.max_lines is not None and self.lines >= self.max_lines: raise StopIteration line = next(self._openfile).rstrip() fields = self._compiled_fs.split(line) record = self._get_record(fields) self.lines += 1 if not self.record_filter(self.lines, record): return None return record def __next__(self): record = self._get_next() while record is None: # skip filtered out lines record = self._get_next() return recordclass Parser(object): def __init__(self, filename, fs=DEFAULT_FIELD_SEP, header=False, max_lines=None, field_func=_DEFAULT_FIELD_FUNC, record_func=_DEFAULT_RECORD_FUNC, field_pre_filter=_DEFAULT_FIELD_FILTER, record_pre_filter=_DEFAULT_RECORD_FILTER, field_post_filter=_DEFAULT_FIELD_FILTER, record_post_filter=_DEFAULT_RECORD_FILTER): """Initialise a Parser Arguments: filename -- the name of the file to parse Keyword arguments: fs -- a 
regex that separates the fields header -- if set to True, the parser interprets the first line of the file as a header. In this case every record is returned as a dictionary and every field in the header is used as the key of the corresponding field in the following lines max_lines -- the maximum number of lines to parse field_func -- a function f(field_key, field) which is applied to every field, field_key is the number of the field if there is no header, the corresponding header key otherwise. default: a function that returns the field record_func -- a function f(NR, NF, field) which is applied to every record, NR is the record number NF is the total number of fields in the record. default: a function that returns the record field_pre_filter -- a function f(field_key, field) which is applied to the field before `field_func`. If it returns a falsy value, the field is not returned. default: lambda *args: True record_pre_filter -- a function f(NR, field) which is applied to the record before `record_func`. If it returns a falsy value, the record is not returned default: lambda *args: True field_post_filter -- a function f(field_key, field) which is applied to the field after `field_func`. If it returns a falsy value, the field is not returned. default: lambda *args: True record_post_filter -- a function f(NR, field) which is applied to the record after `record_func`. 
If it returns a falsy value, the record is not returned default: lambda *args: True """ self.filename = filename self.header = header self.fs = fs self.max_lines = max_lines self.field_func = field_func self.record_func = record_func self.field_pre_filter = field_pre_filter self.record_pre_filter = record_pre_filter self.field_post_filter = field_post_filter self.record_post_filter = record_post_filter def _parse_fields(self, record): new_record = Record() for key, field in record: new_field = self.field_func(key, field) if self.field_post_filter(key, new_field): new_record[key] = new_field return new_record def parse(self): """Parse the file provided at initialisation time returns a generator of `Record`s. The records returned and the fields in them are the result of the application of record_func and field_func respectively. Only records respecting the pre and post filters are present, same applies for the fields in each record """ reader_args = (self.filename, self.fs, self.header, self.max_lines, self.field_pre_filter, self.record_pre_filter) with Reader(*reader_args) as reader: for nr, record in enumerate(reader, 1): # line numbers start from 1 record = self.record_func(nr, self._parse_fields(record)) if self.record_post_filter(nr, record): yield recordclass Column(object): def __init__(self, filename, fs=DEFAULT_FIELD_SEP, header=False, max_lines=None, field_func=lambda x: x, column_func=lambda x: x): """ Initialise a Column object. Arguments: filename -- the name of the file to parse Keyword arguments: fs -- a regex that separates the fields header -- if set to True, the parser interprets the first line of the file as a header. In this case the columns can be indexed as the key specified in the header and the first element of the column is the header max_lines -- the maximum number of lines to parse field_func -- a function f(field) which is applied to every field. 
Default: a function that returns the field column_func -- a function f(column) which is applied to every clumn before returning it. Default: a function that returns the field """ self.filename = filename self.fs = fs self.header = header self.max_lines = max_lines self.field_func = field_func self.column_func = column_func def __getitem__(self, index): """ if index is a slice, it returns a tuple of columns, where each column is the result of the application of `column_func()` on the column. If `header` is True, `index` must be a key in the header, otherwise it can be an integer. In those cases, the result of the application of `column_func()` on the single column is returned. `field_func()` is applied to every field in the column(s). In the case of slicing, indexes start from 0 to make slicing simpler. Please note that this function needs to parse the whole file unless max_lines is specified in the constructor """ parser = Parser(self.filename, self.fs, self.header, max_lines=self.max_lines, field_func=lambda key, field: self.field_func(field)) if isinstance(index, slice): columns = OrderedDict() for record in parser.parse(): for i, field in enumerate(list(record.fields())[index]): try: columns[i].append(field) except KeyError: columns[i] = [field] # post-processing return [self.column_func(tuple(column)) for column in columns.values()] else: column = [] for record in parser.parse(): try: fields = list(record.fields())[index] column.append(fields) except IndexError: column.append(None) return self.column_func(tuple(column)) def get(self, *keys): """ returns a generator of tuples where every element in the tuple is the field of the corresponding column. For example, if passed three keys, every tuple will have three elements. 
Please note that this function needs to parse the whole file unless max_lines is specified in the constructor """ parser = Parser(self.filename, self.fs, self.header, field_pre_filter=lambda key, field: key in keys) for record in parser.parse(): yield tuple(record.fields())Some usage examples (you can find more inthe docs). Examples assume the following file, calledtestinput.
A B C D E F G2 8 0 0 5 7 73 0 7 0 0 7 02 3 5 6 6 6 80 2 1 0 8 3 7Simple reader:
from awk import Readerwith Reader('testinput') as reader: for record in reader: print(record)Output:
Record($1: A, $2: B, $3: C, $4: D, $5: E, $6: F, $7: G)Record($1: 2, $2: 8, $3: 0, $4: 0, $5: 5, $6: 7, $7: 7)Record($1: 3, $2: 0, $3: 7, $4: 0, $5: 0, $6: 7, $7: 0)Record($1: 2, $2: 3, $3: 5, $4: 6, $5: 6, $6: 6, $7: 8)Record($1: 0, $2: 2, $3: 1, $4: 0, $5: 8, $6: 3, $7: 7)Reader with a header:
from awk import Readerwith Reader('testinput', header=True) as reader: for record in reader: print(record)Output:
Record(A: 2, B: 8, C: 0, D: 0, E: 5, F: 7, G: 7)Record(A: 3, B: 0, C: 7, D: 0, E: 0, F: 7, G: 0)Record(A: 2, B: 3, C: 5, D: 6, E: 6, F: 6, G: 8)Record(A: 0, B: 2, C: 1, D: 0, E: 8, F: 3, G: 7)# a field can be accessed as: record[0], record['$1'], record['A']# slicing is also supported: record[1:5:2]This makes every record the sum of its squared fields:
from awk import Parserparser = Parser('testinput', header=True, field_func=lambda key, field: int(field)**2, record_func=lambda nr, nf, record: sum(record.values()))for record in parser.parse(): print(record)Output:
191107210127Simple Column usage:
from awk import Columncolumns = Column('testinput')print(list(columns[3]))Output:
('D', '0', '0', '6', '0')Column with header:
from awk import Columncolumns = Column('testinput', header=True)for line in columns.get('A', 'C', 'E'): print(line)Output:
('2', '0', '5')('3', '7', '0')('2', '5', '6')('0', '1', '8')- \$\begingroup\$@Graipher I have added some examples from the docs\$\endgroup\$Simone Bronzini– Simone Bronzini2016-11-03 13:42:59 +00:00CommentedNov 3, 2016 at 13:42
1 Answer1
Concept
In many ways, the functionality of this library resembles that of the built-in csv module. The main difference is that here you split by regex rather than on a specific character. I think that the design would be improved by modelling your code after the csv module — for example, by having a separate Reader and DictReader.
The fact that the Reader accepts a filename as input limits the applicability of this code. What if I want to parse data coming from a network stream? It can't be done without first writing to a temporary file.
The field numbering convention is very confusing in my opinion:
"""- record['$4'] # same as record[3]"""
record['$0'] doesn't retrieve the original text as I would expect.
You should either give up the AWK-inspired '$4' notation (for which I don't see much value) or fully embrace the one-based column numbering (which does have some precedent in Python regular expressions).
The filter functions make the Parser do much more than parsing, violating the Single Responsibility Principle. In addition, the filtering makes it unclear how record numbering works, or what you mean by the "next" record. I think you would be better off dropping the feature, since Python's generator expressions offer much of the same functionality.
Iterators
Your iterator implementation is more complicated than necessary, and in fact wrong.
Here's how iterators should behave:
>>> words = 'The quick brown fox jumps over the lazy dog'.split()>>> iter1 = iter(words)>>> iter2 = iter(words)>>> next(iter1)'The'>>> next(iter1)'quick'>>> next(iter1)'brown'>>> next(iter2)'The'However, if I ask for two iterators on the sameRecord, they actually interfere with each other:
>>> from awk import Reader>>> with Reader('fox.txt') as reader:... record = next(reader)... >>> str(record)'Record($1: The, $2: quick, $3: brown, $4: fox, $5: jumps, $6: over, $7: the, $8: lazy, $9: dog)'>>> iter1 = iter(record)>>> iter2 = iter(record)>>> next(iter1)('$1', 'The')>>> next(iter1)('$2', 'quick')>>> next(iter1)('$3', 'brown')>>> next(iter2)('$4', 'fox')To support iteration, you didn't need to write a__next__ method; all you needed was this:
class Record: … def __iter__(self): """Return an iterator over the record's keys""" return ((key, self._field_dict[key]) for key in self._key_list)- \$\begingroup\$Thank you for the great insight! You implementation of the iterator is definitely the correct one, I will soon change the code to comply with it. As for the indexing, I think you are right, but here is the reasoning: initially I wanted to have indexes starting from 1, so that
record['$3'] is record[3]but then I wanted to add the slicing feature but I thought that it would have been pretty confusing with indexes starting from 1. So, as a compromise, I decided to keep the "awk notation" for the default keys and keepthe integer indexes starting from 0. (continues on next comment)\$\endgroup\$Simone Bronzini– Simone Bronzini2016-11-07 19:02:44 +00:00CommentedNov 7, 2016 at 19:02 - \$\begingroup\$(continues from previous comment) Do you have an idea on how to handle indexes starting from 1 but making slicing not completely counterintuitive? As for the other suggestions (taking something that is not a filename as input and removing filters from the
Parser), I will indeed look into them. I'll probably mark this as the accepted answer but I would rather wait a bit and see if other detailed answers arrive!\$\endgroup\$Simone Bronzini– Simone Bronzini2016-11-07 19:04:03 +00:00CommentedNov 7, 2016 at 19:04 - \$\begingroup\$Why is the
record['$3']notation important to you? Do you have a use case to demonstrate how it is useful?\$\endgroup\$200_success– 200_success2016-11-07 19:05:48 +00:00CommentedNov 7, 2016 at 19:05 - \$\begingroup\$yeah, right, I forgot to address this point, as a future development I would like to build the Record so that
record['$0']is the whole line. That takes quite a bit of refactoring at the moment but I think it would be useful\$\endgroup\$Simone Bronzini– Simone Bronzini2016-11-07 19:08:53 +00:00CommentedNov 7, 2016 at 19:08 - \$\begingroup\$Yes, I know that's how AWK works, but you still haven't explainedwhy you want to make Python behave like AWK. If you insist on one-based numbering, then perhaps you could use
record.awk_var('$3')orrecord.awk_var(3)to avoid the zero-based semantics that the[]operator implies.\$\endgroup\$200_success– 200_success2016-11-07 19:15:11 +00:00CommentedNov 7, 2016 at 19:15
You mustlog in to answer this question.
Explore related questions
See similar questions with these tags.

