# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import copy
import datetime
import logging
import os
import re
import time

from lib.bucket import BUCKET_ID
from lib.exceptions import EmptyDumpException, InvalidDumpException
from lib.exceptions import ObsoleteDumpVersionException, ParsingException
from lib.pageframe import PageFrame
from lib.range_dict import ExclusiveRangeDict
from lib.symbol import proc_maps


LOGGER = logging.getLogger('dmprof')


# Heap Profile Dump versions

# DUMP_DEEP_[1-4] are obsolete.
# DUMP_DEEP_2+ distinguish mmap regions and malloc chunks.
# DUMP_DEEP_3+ don't include allocation functions in their stack dumps.
# DUMP_DEEP_4+ support comments with '#' and global stats "nonprofiled-*".
# DUMP_DEEP_[1-2] should be processed by POLICY_DEEP_1.
# DUMP_DEEP_[3-4] should be processed by POLICY_DEEP_2 or POLICY_DEEP_3.
DUMP_DEEP_1 = 'DUMP_DEEP_1'
DUMP_DEEP_2 = 'DUMP_DEEP_2'
DUMP_DEEP_3 = 'DUMP_DEEP_3'
DUMP_DEEP_4 = 'DUMP_DEEP_4'

DUMP_DEEP_OBSOLETE = (DUMP_DEEP_1, DUMP_DEEP_2, DUMP_DEEP_3, DUMP_DEEP_4)

# DUMP_DEEP_5 doesn't separate sections for malloc and mmap.
# malloc and mmap are identified in bucket files.
# DUMP_DEEP_5 should be processed by POLICY_DEEP_4.
DUMP_DEEP_5 = 'DUMP_DEEP_5'

# DUMP_DEEP_6 adds a mmap list to DUMP_DEEP_5.
DUMP_DEEP_6 = 'DUMP_DEEP_6'
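
# For orientation: a DUMP_DEEP_6 dump file is laid out roughly as sketched
# below.  This is a hypothetical sample (all values invented) that matches
# what the parsers in this module expect; it is not an authoritative format
# specification.
#
#   heap profile: DUMP_DEEP_6
#   META:
#   Time: 1234567890
#   PageSize: 4096
#   RunID: abc123
#   MMAP_LIST:
#     <one /proc/maps-style line per region, followed by hook lines>
#   GLOBAL_STATS:
#     total    123456  78901
#     <one "<name> <virtual> <committed>" line per global stat>
#   STACKTRACES:
#     <stacktrace lines; see Dump._check_stacktrace_line for validity>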


class Dump(object):
  """Represents a heap profile dump."""

  # Heap dump files are named <prefix>.<pid>.<count>.heap.
  _PATH_PATTERN = re.compile(r'^(.*)\.([0-9]+)\.([0-9]+)\.heap$')

  _HOOK_PATTERN = re.compile(
      r'^ ([ \(])([a-f0-9]+)([ \)])-([ \(])([a-f0-9]+)([ \)])\s+'
      r'(hooked|unhooked)\s+(.+)$', re.IGNORECASE)

  _HOOKED_PATTERN = re.compile(r'(?P<TYPE>.+ )?(?P<COMMITTED>[0-9]+) / '
                               r'(?P<RESERVED>[0-9]+) @ (?P<BUCKETID>[0-9]+)')
  _UNHOOKED_PATTERN = re.compile(r'(?P<TYPE>.+ )?(?P<COMMITTED>[0-9]+) / '
                                 r'(?P<RESERVED>[0-9]+)')

  _OLD_HOOKED_PATTERN = re.compile(r'(?P<TYPE>.+) @ (?P<BUCKETID>[0-9]+)')
  _OLD_UNHOOKED_PATTERN = re.compile(r'(?P<TYPE>.+) (?P<COMMITTED>[0-9]+)')

  _TIME_PATTERN_FORMAT = re.compile(
      r'^Time: ([0-9]+/[0-9]+/[0-9]+ [0-9]+:[0-9]+:[0-9]+)(\.[0-9]+)?')
  _TIME_PATTERN_SECONDS = re.compile(r'^Time: ([0-9]+)$')

  def __init__(self, path, modified_time):
    self._path = path
    matched = self._PATH_PATTERN.match(path)
    self._pid = int(matched.group(2))
    self._count = int(matched.group(3))
    self._time = modified_time
    self._map = {}
    self._procmaps = ExclusiveRangeDict(ProcMapsEntryAttribute)
    self._stacktrace_lines = []
    self._global_stats = {}  # used only in apply_policy

    self._run_id = ''
    self._pagesize = 4096
    self._pageframe_length = 0
    self._pageframe_encoding = ''
    self._has_pagecount = False

    self._version = ''
    self._lines = []

  @property
  def path(self):
    return self._path

  @property
  def count(self):
    return self._count

  @property
  def time(self):
    return self._time

  @property
  def iter_map(self):
    for region in sorted(self._map.iteritems()):
      yield region[0], region[1]

  def iter_procmaps(self):
    # _procmaps is the ExclusiveRangeDict built from MMAP_LIST entries.
    for begin, end, attr in self._procmaps.iter_range():
      yield begin, end, attr

  @property
  def iter_stacktrace(self):
    for line in self._stacktrace_lines:
      yield line

  def global_stat(self, name):
    return self._global_stats[name]

  @property
  def run_id(self):
    return self._run_id

  @property
  def pagesize(self):
    return self._pagesize

  @property
  def pageframe_length(self):
    return self._pageframe_length

  @property
  def pageframe_encoding(self):
    return self._pageframe_encoding

  @property
  def has_pagecount(self):
    return self._has_pagecount

  @staticmethod
  def load(path, log_header='Loading a heap profile dump: '):
    """Loads a heap profile dump.

    Args:
        path: A file path string to load.
        log_header: A preceding string for log messages.

    Returns:
        A loaded Dump object.

    Raises:
        ParsingException for invalid heap profile dumps.
    """
    dump = Dump(path, os.stat(path).st_mtime)
    with open(path, 'r') as f:
      dump.load_file(f, log_header)
    return dump

  def load_file(self, f, log_header):
    self._lines = [line for line in f
                   if line and not line.startswith('#')]

    try:
      self._version, ln = self._parse_version()
      self._parse_meta_information()
      if self._version == DUMP_DEEP_6:
        self._parse_mmap_list()
      self._parse_global_stats()
      self._extract_stacktrace_lines(ln)
    except EmptyDumpException:
      LOGGER.info('%s%s ...ignored an empty dump.' % (log_header, self._path))
    except ParsingException, e:
      LOGGER.error('%s%s ...error %s' % (log_header, self._path, e))
      raise
    else:
      LOGGER.info('%s%s (version:%s)' % (
          log_header, self._path, self._version))

  def _parse_version(self):
    """Parses a version string in self._lines.

    Returns:
        A pair of (a string representing the version of the stacktrace dump,
        and an integer indicating the line number next to the version string).

    Raises:
        ParsingException for invalid dump versions.
    """
    version = ''

    # Skip until an identifiable line.
    headers = ('STACKTRACES:\n', 'MMAP_STACKTRACES:\n', 'heap profile: ')
    if not self._lines:
      raise EmptyDumpException('Empty heap dump file.')
    (ln, found) = skip_while(
        0, len(self._lines),
        lambda n: not self._lines[n].startswith(headers))
    if not found:
      raise InvalidDumpException('No version header.')

    # Identify a version.
    if self._lines[ln].startswith('heap profile: '):
      version = self._lines[ln][13:].strip()
      if version in (DUMP_DEEP_5, DUMP_DEEP_6):
        (ln, _) = skip_while(
            ln, len(self._lines),
            lambda n: self._lines[n] != 'STACKTRACES:\n')
      elif version in DUMP_DEEP_OBSOLETE:
        raise ObsoleteDumpVersionException(version)
      else:
        raise InvalidDumpException('Invalid version: %s' % version)
    elif self._lines[ln] == 'STACKTRACES:\n':
      raise ObsoleteDumpVersionException(DUMP_DEEP_1)
    elif self._lines[ln] == 'MMAP_STACKTRACES:\n':
      raise ObsoleteDumpVersionException(DUMP_DEEP_2)

    return (version, ln)

  def _parse_global_stats(self):
    """Parses lines in self._lines as global stats."""
    (ln, _) = skip_while(
        0, len(self._lines),
        lambda n: self._lines[n] != 'GLOBAL_STATS:\n')

    global_stat_names = [
        'total', 'absent', 'file-exec', 'file-nonexec', 'anonymous', 'stack',
        'other', 'nonprofiled-absent', 'nonprofiled-anonymous',
        'nonprofiled-file-exec', 'nonprofiled-file-nonexec',
        'nonprofiled-stack', 'nonprofiled-other',
        'profiled-mmap', 'profiled-malloc']

    for prefix in global_stat_names:
      (ln, _) = skip_while(
          ln, len(self._lines),
          lambda n: self._lines[n].split()[0] != prefix)
      words = self._lines[ln].split()
      self._global_stats[prefix + '_virtual'] = int(words[-2])
      self._global_stats[prefix + '_committed'] = int(words[-1])

  def _parse_meta_information(self):
    """Parses lines in self._lines for meta information."""
    (ln, found) = skip_while(
        0, len(self._lines),
        lambda n: self._lines[n] != 'META:\n')
    if not found:
      return
    ln += 1

    while True:
      if self._lines[ln].startswith('Time:'):
        matched_seconds = self._TIME_PATTERN_SECONDS.match(self._lines[ln])
        matched_format = self._TIME_PATTERN_FORMAT.match(self._lines[ln])
        if matched_format:
          self._time = time.mktime(datetime.datetime.strptime(
              matched_format.group(1), '%Y/%m/%d %H:%M:%S').timetuple())
          if matched_format.group(2):
            # Assumes the fractional part has exactly three digits
            # (milliseconds).
            self._time += float(matched_format.group(2)[1:]) / 1000.0
        elif matched_seconds:
          self._time = float(matched_seconds.group(1))
      elif self._lines[ln].startswith('Reason:'):
        pass  # Nothing to do for 'Reason:'
      elif self._lines[ln].startswith('PageSize: '):
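

# A minimal usage sketch of Dump.  The file name below is hypothetical;
# Dump expects paths matching _PATH_PATTERN, i.e.
# "<prefix>.<pid>.<count>.heap".
#
#   dump = Dump.load('chrome.12345.0002.heap')
#   print dump.path, dump.count, dump.time
#   for (begin, end), (hook_state, region_info) in dump.iter_map:
#     print hex(begin), hex(end), hook_state, region_info.get('committed')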
        self._pagesize = int(self._lines[ln][10:])
      elif self._lines[ln].startswith('CommandLine:'):
        pass  # Nothing to do for 'CommandLine:'
      elif (self._lines[ln].startswith('PageFrame: ') or
            self._lines[ln].startswith('PFN: ')):
        if self._lines[ln].startswith('PageFrame: '):
          words = self._lines[ln][11:].split(',')
        else:
          words = self._lines[ln][5:].split(',')
        for word in words:
          if word == '24':
            self._pageframe_length = 24
          elif word == 'Base64':
            self._pageframe_encoding = 'base64'
          elif word == 'PageCount':
            self._has_pagecount = True
      elif self._lines[ln].startswith('RunID: '):
        self._run_id = self._lines[ln][7:].strip()
      elif (self._lines[ln].startswith('MMAP_LIST:') or
            self._lines[ln].startswith('GLOBAL_STATS:')):
        # Stop at the first line of the next section, "MMAP_LIST:" or
        # "GLOBAL_STATS:".
        break
      else:
        pass  # Skip unrecognized meta lines.
      ln += 1

  def _parse_mmap_list(self):
    """Parses lines in self._lines as a mmap list."""
    (ln, found) = skip_while(
        0, len(self._lines),
        lambda n: self._lines[n] != 'MMAP_LIST:\n')
    if not found:
      return {}

    ln += 1
    self._map = {}
    current_vma = {}
    pageframe_list = []
    while True:
      entry = proc_maps.ProcMaps.parse_line(self._lines[ln])
      if entry:
        current_vma = {}
        for _, _, attr in self._procmaps.iter_range(entry.begin, entry.end):
          for key, value in entry.as_dict().iteritems():
            attr[key] = value
            current_vma[key] = value
        ln += 1
        continue

      if self._lines[ln].startswith(' PF: '):
        for pageframe in self._lines[ln][5:].split():
          pageframe_list.append(PageFrame.parse(pageframe, self._pagesize))
        ln += 1
        continue

      matched = self._HOOK_PATTERN.match(self._lines[ln])
      if not matched:
        break
      # 2: starting address
      # 5: end address
      # 7: hooked or unhooked
      # 8: additional information
      if matched.group(7) == 'hooked':
        submatched = self._HOOKED_PATTERN.match(matched.group(8))
        if not submatched:
          submatched = self._OLD_HOOKED_PATTERN.match(matched.group(8))
      elif matched.group(7) == 'unhooked':
        submatched = self._UNHOOKED_PATTERN.match(matched.group(8))
        if not submatched:
          submatched = self._OLD_UNHOOKED_PATTERN.match(matched.group(8))
      else:
        # _HOOK_PATTERN only matches 'hooked' or 'unhooked' in group(7).
        assert matched.group(7) in ['hooked', 'unhooked']

      submatched_dict = submatched.groupdict()
      region_info = {'vma': current_vma}
      if submatched_dict.get('TYPE'):
        region_info['type'] = submatched_dict['TYPE'].strip()
      if submatched_dict.get('COMMITTED'):
        region_info['committed'] = int(submatched_dict['COMMITTED'])
      if submatched_dict.get('RESERVED'):
        region_info['reserved'] = int(submatched_dict['RESERVED'])
      if submatched_dict.get('BUCKETID'):
        region_info['bucket_id'] = int(submatched_dict['BUCKETID'])

      if matched.group(1) == '(':
        start = current_vma['begin']
      else:
        start = int(matched.group(2), 16)
      if matched.group(4) == '(':
        end = current_vma['end']
      else:
        end = int(matched.group(5), 16)

      if pageframe_list and pageframe_list[0].start_truncated:
        pageframe_list[0].set_size(
            pageframe_list[0].size - start % self._pagesize)
      if pageframe_list and pageframe_list[-1].end_truncated:
        pageframe_list[-1].set_size(
            pageframe_list[-1].size -
            (self._pagesize - end % self._pagesize))
      region_info['pageframe'] = pageframe_list
      pageframe_list = []

      self._map[(start, end)] = (matched.group(7), region_info)
      ln += 1

  def _extract_stacktrace_lines(self, line_number):
    """Extracts the position of stacktrace lines.

    Valid stacktrace lines are stored into self._stacktrace_lines.

    Args:
        line_number: A line number to start parsing in lines.

    Raises:
        ParsingException for invalid dump versions.
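

# DumpList is a thin sequence wrapper over Dump objects.  A typical
# (hypothetical) use is to load a time series of dumps from one process
# and iterate over them in order:
#
#   dumps = DumpList.load(['chrome.123.0001.heap', 'chrome.123.0002.heap'])
#   for dump in dumps:
#     print dump.count, dump.global_stat('total_committed')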
""" if self._version in (DUMP_DEEP_5, DUMP_DEEP_6): (line_number, _) = skip_while( line_number, len(self._lines), lambda n: not self._lines[n].split()[0].isdigit()) stacktrace_start = line_number (line_number, _) = skip_while( line_number, len(self._lines), lambda n: self._check_stacktrace_line(self._lines[n])) self._stacktrace_lines = self._lines[stacktrace_start:line_number] elif self._version in DUMP_DEEP_OBSOLETE: raise ObsoleteDumpVersionException(self._version) else: raise InvalidDumpException('Invalid version: %s' % self._version) @staticmethod def _check_stacktrace_line(stacktrace_line): """Checks if a given stacktrace_line is valid as stacktrace. Args: stacktrace_line: A string to be checked. Returns: True if the given stacktrace_line is valid. """ words = stacktrace_line.split() if len(words) < BUCKET_ID + 1: return False if words[BUCKET_ID - 1] != '@': return False return True class DumpList(object): """Represents a sequence of heap profile dumps.""" def __init__(self, dump_list): self._dump_list = dump_list @staticmethod def load(path_list): LOGGER.info('Loading heap dump profiles.') dump_list = [] for path in path_list: dump_list.append(Dump.load(path, ' ')) return DumpList(dump_list) def __len__(self): return len(self._dump_list) def __iter__(self): for dump in self._dump_list: yield dump def __getitem__(self, index): return self._dump_list[index] class ProcMapsEntryAttribute(ExclusiveRangeDict.RangeAttribute): """Represents an entry of /proc/maps in range_dict.ExclusiveRangeDict.""" _DUMMY_ENTRY = proc_maps.ProcMapsEntry( 0, # begin 0, # end '-', # readable '-', # writable '-', # executable '-', # private 0, # offset '00', # major '00', # minor 0, # inode '' # name ) def __init__(self): super(ProcMapsEntryAttribute, self).__init__() self._entry = self._DUMMY_ENTRY.as_dict() def __str__(self): return str(self._entry) def __repr__(self): return 'ProcMapsEntryAttribute' + str(self._entry) def __getitem__(self, key): return self._entry[key] def __setitem__(self, key, value): if key not in self._entry: raise KeyError(key) self._entry[key] = value def copy(self): new_entry = ProcMapsEntryAttribute() for key, value in self._entry.iteritems(): new_entry[key] = copy.deepcopy(value) return new_entry def skip_while(index, max_index, skipping_condition): """Increments |index| until |skipping_condition|(|index|) is False. Returns: A pair of an integer indicating a line number after skipped, and a boolean value which is True if found a line which skipping_condition is False for. """ while skipping_condition(index): index += 1 if index >= max_index: return index, False return index, True