Diffstat (limited to 'libdw/libdw_alloc.c')
-rw-r--r--   libdw/libdw_alloc.c   68
1 file changed, 64 insertions(+), 4 deletions(-)
diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
index f2e74d18..0eb02c34 100644
--- a/libdw/libdw_alloc.c
+++ b/libdw/libdw_alloc.c
@@ -35,7 +35,67 @@
 #include <stdlib.h>
 #include "libdwP.h"
 #include "system.h"
+#include "atomics.h"
+#if USE_VG_ANNOTATIONS == 1
+#include <helgrind.h>
+#else
+#define ANNOTATE_HAPPENS_BEFORE(X)
+#define ANNOTATE_HAPPENS_AFTER(X)
+#endif
+#define THREAD_ID_UNSET ((size_t) -1)
+static __thread size_t thread_id = THREAD_ID_UNSET;
+static atomic_size_t next_id = ATOMIC_VAR_INIT(0);
+
+struct libdw_memblock *
+__libdw_alloc_tail (Dwarf *dbg)
+{
+  if (thread_id == THREAD_ID_UNSET)
+    thread_id = atomic_fetch_add (&next_id, 1);
+
+  pthread_rwlock_rdlock (&dbg->mem_rwl);
+  if (thread_id >= dbg->mem_stacks)
+    {
+      pthread_rwlock_unlock (&dbg->mem_rwl);
+      pthread_rwlock_wrlock (&dbg->mem_rwl);
+
+      /* Another thread may have already reallocated. In theory using an
+         atomic would be faster, but given that this only happens once per
+         thread per Dwarf, some minor slowdown should be fine. */
+      if (thread_id >= dbg->mem_stacks)
+        {
+          dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)
+                                    * sizeof (struct libdw_memblock *));
+          if (dbg->mem_tails == NULL)
+            {
+              pthread_rwlock_unlock (&dbg->mem_rwl);
+              dbg->oom_handler();
+            }
+          for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
+            dbg->mem_tails[i] = NULL;
+          dbg->mem_stacks = thread_id + 1;
+          ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
+        }
+
+      pthread_rwlock_unlock (&dbg->mem_rwl);
+      pthread_rwlock_rdlock (&dbg->mem_rwl);
+    }
+
+  /* At this point, we have an entry in the tail array. */
+  ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
+  struct libdw_memblock *result = dbg->mem_tails[thread_id];
+  if (result == NULL)
+    {
+      result = malloc (dbg->mem_default_size);
+      result->size = dbg->mem_default_size
+                     - offsetof (struct libdw_memblock, mem);
+      result->remaining = result->size;
+      result->prev = NULL;
+      dbg->mem_tails[thread_id] = result;
+    }
+  pthread_rwlock_unlock (&dbg->mem_rwl);
+  return result;
+}
 
 
 void *
 __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
@@ -52,10 +112,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
newp->size = size - offsetof (struct libdw_memblock, mem);
newp->remaining = (uintptr_t) newp + size - (result + minsize);
- newp->prev = (struct libdw_memblock*)atomic_exchange_explicit(
- &dbg->mem_tail, (uintptr_t)newp, memory_order_relaxed);
- if (pthread_setspecific (dbg->mem_key, newp) != 0)
- dbg->oom_handler ();
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ newp->prev = dbg->mem_tails[thread_id];
+ dbg->mem_tails[thread_id] = newp;
+ pthread_rwlock_unlock (&dbg->mem_rwl);
return (void *) result;
}
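
The patch replaces the old lock-free tail list (a single atomic_exchange on dbg->mem_tail plus a pthread_setspecific per-thread key) with one slot per thread in a growable array guarded by dbg->mem_rwl: the common path takes only the read lock, and the write lock is taken just once per thread per Dwarf, to grow the array. Below is a minimal, self-contained sketch of that locking discipline outside libdw. Every name in it (slot_registry, registry_bump, worker, and so on) is illustrative, not elfutils API; like the patch, it assumes C11 <stdatomic.h> and the GCC/Clang __thread extension.

/* sketch.c: standalone illustration of the per-thread-slot pattern
   used by __libdw_alloc_tail above.  Build with: gcc -pthread sketch.c  */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define ID_UNSET ((size_t) -1)

static __thread size_t my_id = ID_UNSET;              /* like thread_id */
static atomic_size_t id_source = ATOMIC_VAR_INIT(0);  /* like next_id */

struct slot_registry
{
  pthread_rwlock_t lock;  /* like dbg->mem_rwl */
  long *slots;            /* like dbg->mem_tails: one slot per thread */
  size_t nslots;          /* like dbg->mem_stacks */
};

static struct slot_registry reg =
  { PTHREAD_RWLOCK_INITIALIZER, NULL, 0 };

/* Bump this thread's private slot, growing the array on first use.
   Mirrors the lock dance in __libdw_alloc_tail.  */
static void
registry_bump (struct slot_registry *r)
{
  if (my_id == ID_UNSET)
    my_id = atomic_fetch_add (&id_source, 1);

  pthread_rwlock_rdlock (&r->lock);
  if (my_id >= r->nslots)
    {
      /* POSIX rwlocks cannot be upgraded in place: drop the read lock,
         take the write lock, and re-check, since another thread may
         have grown the array in the unlocked window.  */
      pthread_rwlock_unlock (&r->lock);
      pthread_rwlock_wrlock (&r->lock);
      if (my_id >= r->nslots)
        {
          long *p = realloc (r->slots, (my_id + 1) * sizeof *p);
          if (p == NULL)
            {
              pthread_rwlock_unlock (&r->lock);
              abort ();  /* stand-in for dbg->oom_handler () */
            }
          r->slots = p;
          for (size_t i = r->nslots; i <= my_id; i++)
            r->slots[i] = 0;
          r->nslots = my_id + 1;
        }
      pthread_rwlock_unlock (&r->lock);
      pthread_rwlock_rdlock (&r->lock);
    }

  /* Writing our own slot under the read lock is safe: the lock only
     serializes against the realloc that moves the array, and no other
     thread ever touches this index.  */
  r->slots[my_id]++;
  pthread_rwlock_unlock (&r->lock);
}

static void *
worker (void *arg)
{
  (void) arg;
  for (int i = 0; i < 1000; i++)
    registry_bump (&reg);
  return NULL;
}

int
main (void)
{
  pthread_t t[4];
  for (int i = 0; i < 4; i++)
    pthread_create (&t[i], NULL, worker, NULL);
  for (int i = 0; i < 4; i++)
    pthread_join (t[i], NULL);

  long total = 0;
  for (size_t i = 0; i < reg.nslots; i++)
    total += reg.slots[i];
  printf ("%zu threads, %ld bumps\n", reg.nslots, total);
  return 0;
}

This should print "4 threads, 4000 bumps". The key invariant is the same one the patch relies on: a slot is written only by the thread that owns its index, so the read lock suffices even when mutating your own slot, and it only has to exclude the realloc that moves the whole array. That is why __libdw_allocate in the second hunk gets away with pthread_rwlock_rdlock around its update of dbg->mem_tails[thread_id].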