# parseRegRec-1a.py    David MacQuigg (c)    4/16/06
# This code is open-source on the same terms as Python.
'''Routines to parse a record from the Registry of Public Email Senders.

parse1 - returns a dictionary of top-level key=value pairs.
parse2 - returns a list of items in a selected string value.
'''
# 03/31/06  1a - Modify parse2 for more generalized syntax.

import re

# Regular Expressions for pattern matching
#
# NOTE: RE_KWEQ is deliberately left unanchored, as in the original code:
# any text preceding the next "keyword=" is silently skipped by parse1.
RE_KWEQ = re.compile(r'(\w+)\s*=\s*')   # Keyword=  [a-zA-Z0-9_] =
# A quoted string ends at the FIRST closing quote.  ([^'] also matches
# newlines, so no DOTALL flag is needed.)
RE_QS = re.compile(r"^'([^']*)'\s*")    # 'Quoted string'
RE_UQ = re.compile(r'^(\S+)\s*')        # Unquoted_string

MAXPAIRS = 100   # default upper bound on key=value pairs in one record


class ParseError(Exception):
    '''Raised when a Registry record or value string cannot be parsed.'''


def parse1(record, maxpairs=MAXPAIRS):
    '''Parse a Registry record to a dictionary of key=value pairs.

    Values may be quoted strings containing anything but quotes, or unquoted
    strings containing anything but whitespace characters.  Quoted strings
    begin with a single quote immediately after the = sign.  Double quotes
    inside a string should be avoided, because they are used in many DNS
    record editors to enclose an entire TXT record.

    Because a record can contain both quoted and unquoted strings, it is
    necessary to parse this first level in a left-to-right sequence.  The
    interpretation of characters in a given substring depends on everything
    to the left.  A stray quote or space or a missing = can mess up the
    entire remaining "tail" of the record.  The second level, 'parse2'
    follows a more tolerant parsing technique, using separators comma,
    colon, and plus.

    Parameters:
        record   - the record text to parse.
        maxpairs - maximum number of key=value pairs accepted
                   (default MAXPAIRS).

    Returns a dict mapping each keyword to its string value.  If a keyword
    is repeated, the last value wins.

    Raises ParseError if the record is malformed or holds more than
    maxpairs pairs.

    >>> rec = parse1("svc=S1:A,M2+:A,H1+2:B ip4='66.160.67.184/29 216.183.69.43' ")
    >>> sorted(rec.items())
    [('ip4', '66.160.67.184/29 216.183.69.43'), ('svc', 'S1:A,M2+:A,H1+2:B')]
    >>> sorted(parse1("a='x y' b='z w'").items())
    [('a', 'x y'), ('b', 'z w')]
    '''
    parsedrec = {}
    tail = record               # initially the whole record
    npairs = 0
    while tail != '':           # an empty tail is the normal exit
        if npairs >= maxpairs:
            # Raised outside the try block below, so this specific
            # message is not replaced by the generic one.
            raise ParseError(
                "Error in Registry record. \n# items > %s" % (maxpairs,))
        try:
            kw, tail = RE_KWEQ.split(tail, 1)[1:]
            if tail[0] == "'":                          # Quoted String
                val, tail = RE_QS.split(tail, 1)[1:]
            else:                                       # Unquoted String
                val, tail = RE_UQ.split(tail, 1)[1:]
        except (IndexError, TypeError, ValueError):
            # ValueError: a pattern did not match (nothing to unpack).
            # IndexError: nothing follows the = sign.
            # TypeError:  record is not a string.
            raise ParseError(
                "Problem parsing Registry record. tail = '%s'" % (tail,))
        parsedrec[kw] = val
        npairs += 1
    return parsedrec


def parse2(valstr):
    '''Parse a value string in the form  kw+num:uqs,kw+num:uqs ...

    kw is an alphanumeric keyword.
    num is a decimal integer with default 0 (or 1 for a bare + sign).
    uqs is an unquoted string with default value ''.
    uqs must not contain comma.

    Returns a list of [kw, num, uqs] lists, one per comma-separated item.

    Raises ParseError, naming the offending item, if num is not an integer.

    >>> parse2('S1,XJ:25,XK+5,H1+2:B')
    [['S1', 0, ''], ['XJ', 0, '25'], ['XK', 5, ''], ['H1', 2, 'B']]

    >>> parse2('A:B:C:D,3S:5 R,M2+:777')
    [['A', 0, 'B:C:D'], ['3S', 0, '5 R'], ['M2', 1, '777']]

    >>> parse2('S+3.0:777')
    Traceback (most recent call last):
          - - -  num must be a decimal integer
    ParseError: Problem parsing 'S+3.0:777'

    >>> parse2('CSV+3+2:xyz')
    Traceback (most recent call last):
          - - -  multiple + separators not allowed
    ParseError: Problem parsing 'CSV+3+2:xyz'
    '''
    vlist = []
    for item in valstr.split(','):          # [kw+num:uqs, kw+num:uqs, ...]
        # Only the first colon separates; later colons belong to uqs.
        knum, colon, uqs = item.partition(':')      # kw+num, uqs
        kw, plus, numstr = knum.partition('+')      # kw, num
        if not plus:
            num = 0                         # no + sign at all
        elif numstr == '':
            num = 1                         # bare + sign
        else:
            try:
                num = int(numstr)
            except ValueError:
                raise ParseError("Problem parsing '%s'" % (item,))
        vlist.append([kw, num, uqs])
    return vlist


if __name__ == '__main__':

    record1 = 'svc=S1:A,M2+:A,H1+2:B ip4=192.168.1.1/29'

    record2 = (
        "svc=S1:A,M2:A,H1+:B mth=SPF+5,DK "
        "SPF='v=spf1 mx include:s._spf.test.com include:m._spf.test.com "
        "include:p._spf.test.com include:c._spf.test.com ~all' "
        "DK=k=rsa;p=MHwwDQYJKoZIhvcNAQEBBQADawAwaAJhAKJ2lzDLZ8XlVambQfMXn3LRGKOD5"
        "wDQYJKQYJKoZIhvcNAQEBBQADawA"
    )

    recordx = 'svc=S1:A,M2+:A,H1+2:B ... a bad ending.'

    def test_parse(record):
        '''Parse a Registry record down two levels.

        Parameters for specific services and methods are kept as a simple
        string, to be parsed later by routines for the specific method.
        Keys are printed in sorted order so that the output does not depend
        on dictionary ordering.
        '''
        parsedrec = parse1(record)
        for key in sorted(parsedrec):
            print("%s=%s" % (key, parsedrec[key]))
        if 'svc' in parsedrec:
            svc = parse2(parsedrec['svc'])
            print("==> svc = %s" % (svc,))
        if 'mth' in parsedrec:
            mth = parse2(parsedrec['mth'])
            print("==> mth = %s" % (mth,))

    __test__ = {}   # extra tests for doctest.testmod

    __test__['test_records'] = '''
    >>> test_parse(record1)
    ip4=192.168.1.1/29
    svc=S1:A,M2+:A,H1+2:B
    ==> svc = [['S1', 0, 'A'], ['M2', 1, 'A'], ['H1', 2, 'B']]

    >>> test_parse(record2)
    DK=k=rsa;p=MHwwDQYJKoZIhvcNAQEBBQADawAwaAJhAKJ2lzDLZ8XlVambQfMXn3LRGKOD5wDQYJKQYJKoZIhvcNAQEBBQADawA
    SPF=v=spf1 mx include:s._spf.test.com include:m._spf.test.com include:p._spf.test.com include:c._spf.test.com ~all
    mth=SPF+5,DK
    svc=S1:A,M2:A,H1+:B
    ==> svc = [['S1', 0, 'A'], ['M2', 0, 'A'], ['H1', 1, 'B']]
    ==> mth = [['SPF', 5, ''], ['DK', 0, '']]

    >>> parse1(recordx)
    Traceback (most recent call last):
          - - -
    ParseError: Problem parsing Registry record. tail = '... a bad ending.'
    '''

    import sys
    import doctest
    doctest.testmod(sys.modules['__main__'], verbose=True)