tools/leak_finder.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170

# Copyright (C) 2022 The Qt Company Ltd.
# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0

"""
leak_finder.py
==============

This script finds memory leaks in Python.

Usage:
------

Place one or more lines which should be tested for leaks in a loop:

    from leak_finder import LeakFinder
    ...
    lf = LeakFinder()
    for i in range(1000):
        leaking_statement()
    lf.find_leak()


Theory
------

How to find a leak?

We repeatedly perform an action and observe if that has an unexpected
side effect. There are typically two observations:

* one object is growing its refcount (a pseudo-leak)
* we get many new objects of one type (a true leak)

A difficulty in trying to get leak info is avoiding side effects
of the measurement. Early attempts with lists of refcounts were
unsuccessful. Using array.array for counting refcounts avoids that.


Algorithm
---------
We record a snapshot of all objects in a list and a parallel array
of refcounts.

Then we do some computation and do the same snapshot again.

The structure of a list of all objects is extending at the front for
some reason. That makes the captured structures easy to compare.
We reverse that list and array and have for the objects:

    len(all2) >= len(all1)

    all1[idx] == all2[idx] for idx in range(len(all1))

When taking the second snapshot, the objects still have references from
the first snapshot.
For objects with no effect, the following relation is true:

    refs1[idx] == refs2[idx] - 1 for idx in range(len(all1))

All other objects are potential pseudo-leaks, because they waste
references but no objects in the first place.

Then we look at the newly created objects:
These objects are real leaks if their number is growing with the probe
size. For analysis, the number of new objects per type is counted.
"""

import array
import gc
import sys
import unittest
# this comes from Python, too
from test import support

try:
    sys.getobjects
    have_debug = True
except AttributeError:
    have_debug = False


class LeakFinder(object):
    def __init__(self):
        self.all, self.refs = self._make_snapshot()

    @staticmethod
    def _make_snapshot():
        gc.collect()
        # get all objects
        all = sys.getobjects(0)
        # get an array with the refcounts
        g = sys.getrefcount
        refs = array.array("l", (g(obj) for obj in all))
        # the lists have the same endind. Make comparison easier.
        all.reverse()
        refs.reverse()
        return all, refs

    @staticmethod
    def _short_repr(x, limit=76):
        s = repr(x)
        if len(s) > limit:
            s = s[:limit] + "..."
        return s

    def find_leak(self):
        all1 = self.all
        refs1 = self.refs
        del self.all, self.refs
        all2, refs2 = self._make_snapshot()
        common = len(all1)
        del all1

        srepr = self._short_repr
        # look into existing objects for increased refcounts
        first = True
        for idx in range(common):
            ref = refs2[idx] - refs1[idx] - 1
            if abs(ref) <= 10:
                continue
            obj = all2[idx]
            if first:
                print()
            first = False
            print(f"Fake Leak ref={ref} obj={srepr(obj)}")

        # look at the extra objects by type size
        types = {}
        for idx in range(common, len(all2)):
            obj = all2[idx]
            typ = type(obj)
            if typ not in types:
                types[typ] = []
            types[typ].append(obj)
        first = True
        for typ in types:
            oblis = types[typ]
            ref = len(oblis)
            if ref <= 10:
                continue
            try:
                oblis.sort()
            except TypeError:
                pass
            if first:
                print()
            first = False
            left, mid, right = oblis[0], oblis[ref // 2], oblis[-1]
            print(f"True Leak ref={ref} typ={typ} left={left} mid={mid} right={right}")


class TestDemo(unittest.TestCase):

    @unittest.skipUnless(have_debug, 'You need a debug build with "--with-trace-refs"')
    def test_demo(self):
        # create a pseudo leak and a true leak
        fake_leak_obj = []
        true_leak_obj = []
        lf = LeakFinder()
        refs_before = sys.gettotalrefcount()
        for idx in range(100):
            fake_leak_obj.append("same string")
            true_leak_obj.append(idx + 1000)    # avoiding cached low numbers
        refs_after = sys.gettotalrefcount()
        lf.find_leak()
        self.assertNotAlmostEqual(refs_after - refs_before, 0, delta=10)


if __name__ == "__main__":
    unittest.main()