#!/Users/krader/symlinks/python3
"""Extract data captured by Apache mod_dumpio.

Scan the input looking for lines from mod_dumpio representing the input
received from the remote system. Capture the data. When the remote client ID
changes dump the captured data.

Note that we do absolutely everything using binary I/O and byte-arrays rather
than text I/O and strings. This is because we don't actually care what
encoding is used in the mod_dumpio output or the Apache HTTP error log. We're
potentially dealing with non-ASCII and non-UTF-8 data (e.g., binary data such
as a Zip archive).

Usage:
    error_log_data_extract < error_log > decoded_data

Note that you do not have to pipe the entire error log through this program.
You can preprocess the log by, for example, extracting with grep just the
lines associated with a given IP address and sending that subset of lines
through this program.

This program assumes you're running the corrected mod_dumpio module I posted
to Apache PR57045 (https://bz.apache.org/bugzilla/show_bug.cgi?id=57045). It
can be used on output from the broken mod_dumpio module but it won't,
obviously, correctly handle 0x00 or 0xFF bytes.
"""

import re
import sys

# Banner lines written before and after each block of decoded data.
data_sep_in = (b'=#= input ' + b'=#=' * (70 // 3)) + b'\n'
data_sep_out = (b'=#= output ' + b'=#=' * (70 // 3)) + b'\n'

# Marker logged at the start of every request. `$` also matches just before a
# trailing newline, so this works for terminated and unterminated lines.
dumpio_new_req_re = re.compile(rb' mod_dumpio: new request$')

# A mod_dumpio data line. The group names must match the names looked up in
# FilterLog(): client_id, which and data.
dumpio_re = re.compile(rb'^.*? \[client (?P<client_id>.+?)\]\s'
                       rb'mod_dumpio:\s+(?P<which>dumpio_(?:in|out))\s+'
                       rb'\(data-[^)]+\):\s(?P<data>.*)$')

# Alternation order matters: an escaped backslash and the two-escape encodings
# of 0x00/0xFF must be tried before the generic \xHH form.
escape_re = re.compile(rb'(\\\\|\\x[fF][fF]\\x0[12]|'
                       rb'\\x[0-9a-fA-F][0-9a-fA-F]|\\[abfnrtv"])')

hex_escape_to_bin = {'\\x{:02x}'.format(i).encode('ascii'): bytes([i])
                     for i in range(256)}
escape_to_char = {b'\\"': b'"', b'\\\\': b'\\', b'\\a': b'\a', b'\\b': b'\b',
                  b'\\f': b'\f', b'\\n': b'\n', b'\\r': b'\r', b'\\t': b'\t',
                  b'\\v': b'\v', b'\\xff\\x02': b'\xff',
                  b'\\xff\\x01': b'\x00'}
escape_to_char.update(hex_escape_to_bin)


class ReconstructCapturedData:
    """Capture the data from mod_dumpio and emit it on demand.

    Input and output chunks are accumulated separately and written to
    `out_fh` (a binary file object) by DumpCapturedData().
    """

    def __init__(self, out_fh):
        self.out_fh = out_fh
        self.captured_input = []
        self.captured_output = []

    @staticmethod
    def _SubEscapes(match):
        """Convert text escaped by the mod_dumpio module to the original bytes.

        The lookup table only has lower-case hex keys, so the matched escape
        is lower-cased first. Every string escape_re can match has an entry.
        """
        return escape_to_char[match.group(0).lower()]

    def CaptureData(self, which, data):
        """Capture the data logged by mod_dumpio.

        Args:
            which: b'dumpio_in' for request data; anything else is treated as
                response data.
            data: The still-escaped payload bytes from the log line.
        """
        data = escape_re.sub(self._SubEscapes, data)
        if which == b'dumpio_in':
            self.captured_input.append(data)
        else:
            self.captured_output.append(data)

    def _DumpCapturedData(self, data, sep):
        """Write the captured chunks in `data`, bracketed by `sep` lines.

        Nothing is written when `data` is empty. A newline is added before
        the closing separator only if the written data did not end with one.
        """
        if not data:
            return
        self.out_fh.write(sep)
        last_byte = b''
        for chunk in data:
            if chunk:
                # Slice rather than index: indexing bytes yields an int,
                # which never compares equal to b'\n'.
                last_byte = chunk[-1:]
            self.out_fh.write(chunk)
        if last_byte != b'\n':
            self.out_fh.write(b'\n')
        self.out_fh.write(sep)

    def DumpCapturedData(self):
        """Write all captured input then output data and reset the buffers."""
        self._DumpCapturedData(self.captured_input, data_sep_in)
        self._DumpCapturedData(self.captured_output, data_sep_out)
        self.captured_input = []
        self.captured_output = []


def FilterLog(in_fh, out_fh):
    """Copy an error log from `in_fh` to `out_fh`, inserting decoded data.

    Every input line is echoed unchanged. Payloads from mod_dumpio data lines
    are accumulated and written, between separator banners, when a
    "new request" marker is seen, when the client ID changes, and at end of
    input.

    Args:
        in_fh: Binary file object (or any iterable of bytes lines) to read.
        out_fh: Binary file object to write to.
    """
    captured_data = ReconstructCapturedData(out_fh)
    prev_client_id = None
    for line in in_fh:
        if dumpio_new_req_re.search(line):
            captured_data.DumpCapturedData()
            out_fh.write(line)
            continue

        match = dumpio_re.match(line)
        if match:
            client_id = match.group('client_id')
            if client_id != prev_client_id and prev_client_id is not None:
                # This should be a "can't happen" situation if we're parsing
                # mod_dumpio output that includes the "new request" message.
                # This will occur for output from older versions of mod_dumpio.
                captured_data.DumpCapturedData()
            prev_client_id = client_id
            captured_data.CaptureData(match.group('which'),
                                      match.group('data'))
        out_fh.write(line)

    captured_data.DumpCapturedData()


def main():
    """Filter an Apache error log containing mod_dumpio data."""
    # closefd=False: leaving the `with` block flushes the binary wrappers
    # without closing the process's real stdin/stdout descriptors.
    with open(sys.stdin.fileno(), 'rb', closefd=False) as in_fh, \
            open(sys.stdout.fileno(), 'wb', closefd=False) as out_fh:
        FilterLog(in_fh, out_fh)


if __name__ == '__main__':
    main()