# include_analyzer_memoizing_node.py
"""A graph-based algorithm for memoizing include closure calculations."""
__author__ = "Nils Klarlund"
import os
import basics
import macro_eval
import parse_file
import statistics
import include_analyzer
# Convenience aliases for facilities defined in the shared basics module:
# the Debug logging function, its trace/data verbosity flags, and the
# exception type used throughout the include server.
Debug = basics.Debug
DEBUG_TRACE = basics.DEBUG_TRACE
DEBUG_DATA = basics.DEBUG_DATA
NotCoveredError = basics.NotCoveredError
# Resolution modes for FindNode. RESOLVED means the filepath has already been
# resolved against the search directories; QUOTE and ANGLE correspond to
# '#include "..."' and '#include <...>' lookup rules; NEXT is for
# '#include_next'. (Reconstructed: the four assignments had been fused onto a
# single, syntactically invalid line.)
RESOLVED = 1
QUOTE = 2
ANGLE = 3
NEXT = 4
RESOLUTION_MODES = [RESOLVED, QUOTE, ANGLE, NEXT]
# Indexed by mode value (modes start at 1, hence the leading None).
RESOLUTION_MODES_STR = [None, 'RESOLVED', 'QUOTE', 'ANGLE', 'NEXT']
class UnionCache(object):
    """Frozen sets and their unions, represented by integers.

    Frozensets are Python's immutable and hashable sets. Each distinct
    frozenset is assigned a small integer id, which makes it cheap to
    memoize union operations on pairs of ids.
    """

    def __init__(self):
        """Constructor.

        Instance variables:
          members: maps a set id to the frozenset it denotes
          cache: maps a pair (set1_id, set2_id) to the id of their union
          id_map: maps each frozenset seen so far to its id in {1, 2, ...}
        """
        self.members = {}
        self.cache = {}
        self.id_map = {}

    def SetId(self, members):
        """Memoize the frozenset of members and return its set id."""
        frozen = frozenset(members)
        set_id = self.id_map.get(frozen)
        if set_id is None:
            # First time we see this set: ids are handed out densely from 1.
            set_id = len(self.id_map) + 1
            self.id_map[frozen] = set_id
            self.members[set_id] = frozen
        return set_id

    def Elements(self, set_id):
        """The frozenset corresponding to a set id."""
        return self.members[set_id]

    def Union(self, set1_id, set2_id):
        """Return the set id of the union of the sets denoted by the ids."""
        pair = (set1_id, set2_id)
        if pair not in self.cache:
            union_id = self.SetId(self.members[set1_id]
                                  | self.members[set2_id])
            self.cache[pair] = union_id
        return self.cache[pair]
class SupportRecord(object):
    """Record the symbols that expressions depend on.

    A support record holds a mutable support set of symbols. Each node in
    the summary graph is associated with one; the set consists of the
    symbols upon which the node's computed includes depend. A record starts
    out valid and becomes invalid when one of its symbols is redefined. For
    efficiency, users of this object sometimes manipulate the valid flag
    directly.
    """

    def __init__(self, support_master):
        """Constructor.

        Argument:
          support_master: holds the reverse mapping from symbols to the
            support records that contain them.
        Instance variables:
          support_master: see above
          union_cache: the master's union cache, used for set manipulation
          support_id: id (in union_cache) of the set of all symbols
            referenced by computed includes in any include dependency of
            the node owning this record; starts as the empty set
          valid: a Boolean
        """
        self.support_master = support_master
        self.valid = True
        self.union_cache = support_master.union_cache
        self.support_id = self.union_cache.SetId([])

    def Update(self, set_id):
        """Augment the support record with the set represented by set_id."""
        union_id = self.union_cache.Union(self.support_id, set_id)
        if union_id == self.support_id:
            # Nothing new: the symbols were already in the support set.
            return
        # The support grew, so register the reverse dependency before
        # adopting the enlarged set.
        self.support_master.SupportRecordDependencyAdd(
            self.union_cache.Elements(set_id), self)
        self.support_id = union_id

    def UpdateSet(self, symbols):
        """Add symbols to the support (list form of Update)."""
        self.Update(self.union_cache.SetId(symbols))
class SupportMaster(object):
    """Record the support records that depend on a given symbol.

    A map symbol_to_records is maintained: for each symbol s,
    self.symbol_to_records[s] is the set of support records whose support
    set contains s.
    """

    def __init__(self):
        """Constructor.

        Instance variables:
          symbol_to_records: maps a symbol to the set of dependent records
          union_cache: a UnionCache for memoizing sets and their unions
        """
        self.symbol_to_records = {}
        self.union_cache = UnionCache()

    def SupportRecordDependencyAdd(self, symbols, support_record):
        """Record that support_record depends on each of the symbols."""
        for symbol in symbols:
            self.symbol_to_records.setdefault(symbol, set()).add(
                support_record)

    def InvalidateRecords(self, symbol):
        """Mark as invalid all support records whose set contains symbol."""
        # Unknown symbols simply have no dependent records.
        for support_record in self.symbol_to_records.get(symbol, ()):
            support_record.valid = False
class IncludeAnalyzerMemoizingNode(include_analyzer.IncludeAnalyzer):
    """A memoizing algorithm for include analysis based on a graph construction.

    Instance variables:
      master_cache: a two-level node cache

    The key of the top-level cache is an include configuration of the form:

      (currdir_idx, quote_dirs, angle_dirs)

    The value of the top-level cache is a node cache (defined next).

    The key of the second-level (node) cache has the form:

      (filepath_idx, resolution_mode, file_dir_idx)

    A node is the value of the second-level (node) cache. It has the form:

      (filepath_real_idx, filepath_resolved_pair, [node_0, ..., node_n-1],
       support_record)

    where each node_i is a node representing a direct include dependency of
    this node. The direct include dependencies are the edges of a directed
    graph called the summary graph.

    In this class, the top-level key is referred to as 'incl_config', the
    top-level value as 'nodes_for_incl_config', the second-level key as
    'key', and the second-level value as 'node'. There are many disjoint
    summary graphs, one for each include configuration.

    As stated, the node cache works pre-resolution. But it may well be that,
    say, two occurrences of #include "foo.h" in files with different file
    directories (that is, the files containing the foo.h includes are in
    different directories) actually resolve to the same foo.h file. In that
    case we should reuse the foo.h node -- with a catch: although the file
    may be the same real file, the containing directories may be different.
    For example, the file may be in the real location /D/foo.h, but it may
    also be known as /E/foo.h, where E is a directory containing a symbolic
    link foo.h pointing to /D/foo.h. If file foo.h contains the directive

      #include "bar.h"

    then bar.h is looked for in /D if the file directory is /D, but in /E if
    the file directory is /E. That is, the real file directory of /E/foo.h
    is *not* the directory component of the realpath of /E/foo.h. Rather, it
    is the realpath of the directory component of /E/foo.h, that is, the
    realpath of /E.

    Thus, if we memoize files according to their real location, then the
    file directory as understood above must also be taken into account. In
    particular, we also use as keys pairs of the form:

      (realpath index of resolved file, realpath index of file directory)

    This realpath-oriented memoization is not a frivolous attempt at
    optimization. It is essential to avoiding infinite loops as in:

      D/mem.h
      D/../D/mem.h
      D/../D/../D/mem.h

    generated by an include of the form '#include "../D/mem.h"' in file
    mem.h. One would think that obviously these prefixes denote the same
    location. But they need not! For D of the first line could be a symbolic
    link to a real directory dir1_D. And the second D could be another
    symbolic link in dir1_D/ to dir2_D, etc. So, when the include processor
    is led astray by includes that resolve this way, it is by no means
    obvious how to investigate the paths with symbolic links of the form
    (D/..)*. This will diverge even if there is just one mem.h file with an
    include of ../D/mem.h started in the real directory D. [Remember that
    the include processor does not heed include guards.]

    For termination, we rely on the fact that eventually the pair (realpath
    of file, realpath of file directory) will be seen again (because there
    are finitely many files and directories). In practice, with this
    technique, the recursion is stopped at the second attempt to include
    mem.h.
    """

    # Index of the support record within a node tuple.
    SUPPORT_RECORD = 3

    def _InitializeAllCachesMemoizing(self):
        """(Re)create the summary-graph caches and register the define hook."""
        self.master_cache = {}
        self.support_master = SupportMaster()
        # Whenever the parser (re)defines a symbol, invalidate the support
        # records that depend on it, so stale computed includes are redone.
        self.parse_file.SetDefineCallback(
            self.support_master.InvalidateRecords)

    def __init__(self, client_root_keeper, stat_reset_triggers=None):
        """Constructor.

        Arguments:
          client_root_keeper: passed through to the base-class constructor
          stat_reset_triggers: optional mapping passed to the base class.
            The None default replaces the original mutable default '{}',
            which would have been shared across instances; an empty dict is
            substituted at call time, preserving the old behavior.
        """
        if stat_reset_triggers is None:
            stat_reset_triggers = {}
        include_analyzer.IncludeAnalyzer.__init__(self,
                                                  client_root_keeper,
                                                  stat_reset_triggers)
        self._InitializeAllCachesMemoizing()

    def ClearStatCaches(self):
        """Reset stat caches and the node cache, which depends on stat caches."""
        include_analyzer.IncludeAnalyzer.ClearStatCaches(self)
        self._InitializeAllCachesMemoizing()

    def _PrintableFilePath(self, fp):
        """Render fp for debug output.

        fp is either an includepath index or a (dir_idx, includepath_idx)
        pair; any other shape yields False (preserving the semantics of the
        original and/or expression).
        """
        if isinstance(fp, int):
            return self.includepath_map.String(fp)
        if isinstance(fp, tuple):
            return (self.directory_map.string[fp[0]],
                    self.includepath_map.string[fp[1]])
        return False

    def RunAlgorithm(self, filepath_resolved_pair, filepath_real_idx):
        """See RunAlgorithm of class IncludeAnalyzer in include_analyzer."""
        incl_config = (self.currdir_idx, self.quote_dirs, self.angle_dirs)
        try:
            nodes_for_incl_config = self.master_cache[incl_config]
        except KeyError:
            nodes_for_incl_config = self.master_cache[incl_config] = {}
        # Enter the -D command-line definitions into the symbol table,
        # passing InvalidateRecords along so that redefinitions invalidate
        # dependent support records.
        for d_opt in self.d_opts:
            if len(d_opt) == 1:
                # '-DFOO' defines FOO as 1, per C preprocessor convention.
                # (Bug fix: previously this branch computed lhs/rhs but the
                # insertion below was reached only from the two-token case,
                # so valueless -D options were silently dropped.)
                lhs, rhs = d_opt[0], "1"
            elif len(d_opt) == 2:
                [lhs, rhs] = d_opt
            else:
                # Not a simple definition; leave it for the compiler.
                continue
            parse_file.InsertMacroDefInTable(
                lhs, rhs, self.symbol_table,
                self.support_master.InvalidateRecords)
        node = self.FindNode(nodes_for_incl_config,
                             filepath_resolved_pair,
                             RESOLVED,
                             None,
                             filepath_real_idx)
        include_closure = {}
        self._CalculateIncludeClosureExceptSystem(node, include_closure)
        return include_closure

    def FindNode(self,
                 nodes_for_incl_config,
                 fp,
                 resolution_mode,
                 file_dir_idx=None,
                 fp_real_idx=None):
        """Find a previously constructed node or create a new node.

        Arguments:
          nodes_for_incl_config: a dictionary (see class documentation)
          fp: a filepath index or, if resolution_mode == RESOLVED, a
            filepath pair
          resolution_mode: an integer in RESOLUTION_MODES
          file_dir_idx: consider the file F that has the line
            '#include "fp"' which is causing us to call FindNode on fp.
            file_dir_idx is the index of dirname(F). (This argument affects
            the semantics of resolution for resolution_mode == QUOTE, and
            is also used to resolve subframework includes.)
          fp_real_idx: the realpath index of the resolved filepath
            (useful for resolution_mode == RESOLVED only)
        Returns:
          a node or None
        Raises:
          NotCoveredError

        This function is long, too long. But function calls are expensive
        in Python. TODO(klarlund): refactor.
        """
        dir_map = self.directory_map
        includepath_map = self.includepath_map
        resolve = self.build_stat_cache.Resolve
        # Sanity-check the permissible argument combinations.
        assert isinstance(nodes_for_incl_config, dict)
        assert (not self.IsFilepathPair(fp)
                or resolution_mode == RESOLVED)
        assert (not fp
                or (self.IsFilepathPair(fp)
                    or (resolution_mode != RESOLVED
                        and self.IsIncludepathIndex(fp))))
        assert resolution_mode in RESOLUTION_MODES
        assert resolution_mode != QUOTE or file_dir_idx
        assert not fp_real_idx or resolution_mode == RESOLVED
        if __debug__:
            Debug(DEBUG_TRACE,
                  "FindNode: fp: %s, mode: %s\n file_dir: %s,\n fp_real: %s" %
                  (self._PrintableFilePath(fp),
                   RESOLUTION_MODES_STR[resolution_mode],
                   not file_dir_idx and " " or dir_map.string[file_dir_idx],
                   not fp_real_idx and " "
                   or self.realpath_map.string[fp_real_idx]))
        statistics.find_node_counter += 1
        if fp is None:
            return None
        key = (fp, resolution_mode, file_dir_idx)
        if key in nodes_for_incl_config:
            if nodes_for_incl_config[key][self.SUPPORT_RECORD].valid:
                statistics.master_hit_counter += 1
                return nodes_for_incl_config[key]
            else:
                # The node exists, but a symbol it depends on was redefined:
                # recompute its dependencies below, reusing the resolution
                # results stored in the stale node (children are discarded).
                node = nodes_for_incl_config[key]
                currdir_idx = self.currdir_idx
                quote_dirs = self.quote_dirs
                angle_dirs = self.angle_dirs
                [fp_real_idx, fp_resolved_pair, _, support_record] = node
                Debug(DEBUG_TRACE,
                      "Invalid record for translation unit: %s, file: %s",
                      self.translation_unit, self._PrintableFilePath(fp))
        else:
            # Brand-new node: resolve fp according to resolution_mode.
            support_record = SupportRecord(self.support_master)
            currdir_idx = self.currdir_idx
            quote_dirs = self.quote_dirs
            angle_dirs = self.angle_dirs
            if resolution_mode == QUOTE:
                (fp_resolved_pair, fp_real_idx) = (
                    resolve(fp, currdir_idx, file_dir_idx, quote_dirs,
                            file_dir_idx))
            elif resolution_mode == ANGLE:
                (fp_resolved_pair, fp_real_idx) = (
                    resolve(fp, currdir_idx, None, angle_dirs, file_dir_idx))
            elif resolution_mode == NEXT:
                # include_next resolution is deferred; see the NEXT
                # handling further below.
                fp_resolved_pair = None
                fp_real_idx = None
            else:
                assert resolution_mode == RESOLVED
                assert fp_real_idx
                assert self.IsFilepathPair(fp)
                fp_resolved_pair = fp
            if fp_resolved_pair:
                # Mirror the resolved file under the client root, once per
                # (resolved pair, current directory).
                (d_, fp_) = fp_resolved_pair
                if (fp_resolved_pair, currdir_idx) not in self.mirrored:
                    self.mirrored.add((fp_resolved_pair, currdir_idx))
                    self.mirror_path.DoPath(
                        os.path.join(dir_map.string[currdir_idx],
                                     basics.PathFromDirMapEntryAndInclude(
                                         dir_map.string[d_],
                                         includepath_map.string[fp_])),
                        currdir_idx,
                        self.client_root_keeper.client_root)
        # Resolution yields both components or neither.
        assert not fp_resolved_pair or fp_real_idx
        assert not fp_real_idx or fp_resolved_pair
        children = []
        node = (fp_real_idx, fp_resolved_pair, children, support_record)
        nodes_for_incl_config[key] = node
        if not fp_resolved_pair:
            if resolution_mode == NEXT:
                # For include_next, make a child for each quote searchdir
                # taken singly: the effective resolution depends on where
                # the including file itself was found.
                for d in quote_dirs:
                    (fp_resolved_pair_, fp_real_idx_) = (
                        resolve(fp, currdir_idx, None, (d,), file_dir_idx))
                    if fp_resolved_pair_ is not None:
                        node_ = self.FindNode(nodes_for_incl_config,
                                              fp_resolved_pair_,
                                              RESOLVED,
                                              None, fp_real_idx_)
                        children.append(node_)
                return node
            else:
                # Unresolvable: return the barren node.
                return node
        assert (fp and fp_real_idx and fp_resolved_pair)
        (searchdir_idx, includepath_idx) = fp_resolved_pair
        # Fetch (dirname idx, realpath idx of dirname) of the resolved file;
        # probe the cache dict directly first because function calls are
        # expensive in Python.
        try:
            (fp_dirname_idx, fp_dirname_real_idx) = (
                self.dirname_cache.cache[(currdir_idx,
                                          searchdir_idx,
                                          includepath_idx)])
        except KeyError:
            (fp_dirname_idx, fp_dirname_real_idx) = (
                self.dirname_cache.Lookup(currdir_idx,
                                          searchdir_idx,
                                          includepath_idx))
        if resolution_mode != RESOLVED:
            # Try the realpath-oriented memoization described in the class
            # documentation.
            if ((fp_real_idx, fp_dirname_real_idx) in nodes_for_incl_config
                and support_record.valid):
                statistics.master_hit_counter += 1
                # Make the pre-resolution key point at the canonical node.
                node = nodes_for_incl_config[(fp_real_idx,
                                              fp_dirname_real_idx)]
                nodes_for_incl_config[key] = node
                return node
            nodes_for_incl_config[(fp_real_idx, fp_dirname_real_idx)] = node
        statistics.master_miss_counter += 1
        # We are recomputing the dependencies now, so the record becomes
        # valid again.
        support_record.valid = True
        try:
            (quote_includes, angle_includes, expr_includes, next_includes) = (
                self.file_cache[fp_real_idx])
        except KeyError:
            self.file_cache[fp_real_idx] = self.parse_file.Parse(
                self.realpath_map.string[fp_real_idx],
                self.symbol_table)
            (quote_includes, angle_includes, expr_includes, next_includes) = (
                self.file_cache[fp_real_idx])
        if self.systemdir_prefix_cache.StartsWithSystemdir(
                fp_real_idx, self.realpath_map):
            # Do not chase the includes of files under a system directory.
            return node
        for quote_filepath in quote_includes:
            node_ = self.FindNode(nodes_for_incl_config, quote_filepath,
                                  QUOTE, fp_dirname_idx)
            if node_:
                children.append(node_)
                support_record.Update(node_[self.SUPPORT_RECORD].support_id)
        for angle_filepath in angle_includes:
            node_ = self.FindNode(nodes_for_incl_config, angle_filepath,
                                  ANGLE, fp_dirname_idx)
            if node_:
                children.append(node_)
                support_record.Update(node_[self.SUPPORT_RECORD].support_id)
        if __debug__:
            if expr_includes:
                Debug(DEBUG_DATA,
                      "FindNode, expr_includes: file: %s: '%s'",
                      (isinstance(fp, int) and includepath_map.String(fp)
                       or (isinstance(fp, tuple) and
                           (dir_map.string[fp[0]],
                            includepath_map.string[fp[1]]))),
                      expr_includes)
        for expr in expr_includes:
            # A computed include: evaluate the expression under the current
            # symbol table and remember which symbols the result depends on.
            (files, symbols) = (
                macro_eval.ResolveExpr(includepath_map.Index,
                                       resolve,
                                       expr,
                                       self.currdir_idx, fp_dirname_idx,
                                       self.quote_dirs, self.angle_dirs,
                                       self.symbol_table))
            for (fp_resolved_pair_, fp_real_idx_) in files:
                node_ = self.FindNode(nodes_for_incl_config,
                                      fp_resolved_pair_,
                                      RESOLVED, None, fp_real_idx_)
                if node_:
                    children.append(node_)
                    support_record.Update(
                        node_[self.SUPPORT_RECORD].support_id)
            support_record.UpdateSet(symbols)
        for include_next_filepath in next_includes:
            node_ = self.FindNode(nodes_for_incl_config,
                                  include_next_filepath, NEXT,
                                  fp_dirname_idx)
            if node_:
                children.append(node_)
                support_record.Update(node_[self.SUPPORT_RECORD].support_id)
        return node

    def _CalculateIncludeClosureExceptSystem(self, node, include_closure):
        """Explore the subgraph reachable from node and gather real paths.

        Arguments:
          node: the node of the translation unit, the initial source file
            (see class documentation for a description of this tuple)
          include_closure: a map (see IncludeAnalyzer.RunAlgorithm
            documentation for a description of this map)
        Modifies:
          include_closure. We modify in place to avoid copying this big
          struct.
        """
        assert not include_closure  # must start out empty
        self.systemdir_prefix_cache.FillCache(self.realpath_map)
        visited = set([])
        starts_with_systemdir = self.systemdir_prefix_cache.cache
        dir_map_string = self.directory_map.string
        if not node:
            return
        # Iterative depth-first traversal; nodes are deduplicated by object
        # identity because summary subgraphs are shared.
        stack = [node]
        if __debug__:
            statistics.len_calculated_closure_nonsys = 0
        while stack:
            node = stack.pop(-1)
            id_ = id(node)
            if id_ in visited:
                continue
            visited.add(id_)
            if node[0]:
                if __debug__:
                    statistics.len_calculated_closure_nonsys += 1
                if not starts_with_systemdir[node[0]]:
                    if node[0] not in include_closure:
                        include_closure[node[0]] = []
                    # Record the resolution pair only when its searchdir is
                    # an absolute path.
                    searchdir_idx = node[1][0]
                    if (searchdir_idx and dir_map_string[searchdir_idx] and
                        dir_map_string[searchdir_idx][0] == '/'):
                        include_closure[node[0]].append(node[1])
            stack.extend(node[2])
        statistics.len_calculated_closure = len(include_closure)