Changes in / [09f3093:2d9e75d]
Files:
- 1 added
- 1 deleted
- 3 edited

Legend (markers used in the diffs below):
- ' '  Unmodified
- '+'  Added
- '-'  Removed
Makefile
r510af01 → r3c9fc94

 CXXFLAGS = -std=gnu++0x -O3 -g

-CPPFLAGS =
+# When compiling with CXX=powerpc64-bgq-linux-g++, we need these:
+CPPFLAGS = -I/bgsys/drivers/ppcfloor -I/bgsys/drivers/ppcfloor/spi/include/kernel/cnk
+
 LDFLAGS = -lpthread -ldl

…
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -fPIC -shared -o libmemlog.so memlog.cpp

-install: all memlog2dot README
-	cp -a libmemlog.so memlog_s.o memlog2dot README $(DESTDIR)/
+install: all memlog_analyze README
+	cp -a libmemlog.so memlog_s.o memlog_analyze README $(DESTDIR)/
+	echo '-Wl,--wrap,malloc,--wrap,valloc,--wrap,realloc,--wrap,calloc,--wrap,memalign,--wrap,free,--wrap,posix_memalign,--wrap,mmap,--wrap,mmap64,--wrap,munmap $(DESTDIR)/memlog_s.o -lpthread -ldl' > $(DESTDIR)/memlog_s_ld_cmds

 clean:
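As background on the flag list echoed into memlog_s_ld_cmds above: each -Wl,--wrap,NAME flag asks ld to resolve calls to NAME against __wrap_NAME and to expose the original symbol as __real_NAME, which is the mechanism memlog_s.o relies on for statically-linked programs. A minimal standalone sketch of that convention follows; the file name wrap_demo.cpp and the choice of wrapping only malloc are illustrative, not part of memlog itself.

    // wrap_demo.cpp -- illustrative only; build and link with:
    //   g++ -c wrap_demo.cpp
    //   g++ your_objs.o wrap_demo.o -Wl,--wrap,malloc
    #include <cstddef>
    #include <cstdio>

    extern "C" {
    // With --wrap,malloc in effect, the linker provides __real_malloc as an
    // alias for the original malloc implementation...
    void *__real_malloc(std::size_t size);

    // ...and redirects every reference to malloc in the wrapped objects here.
    void *__wrap_malloc(std::size_t size) {
      void *ptr = __real_malloc(size);
      // NOTE: a production wrapper must also guard against re-entry if the
      // logging call itself allocates; memlog does this with its in_malloc flag.
      std::fprintf(stderr, "malloc(%zu) -> %p\n", size, ptr);
      return ptr;
    }
    }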
README
r24aa734 → r3c9fc94

   -L/path/to/memlog -Wl,-rpath,/path/to/memlog -lmemlog

-For statically-linked applications, add the following to your linker flags:
+For statically-linked applications, ld's automatic wrapping functionality is
+employed, and the exact set of necessary flags is large, so a file named
+memlog_s_ld_cmds has been provided containing the necessary flags.

-  -Wl,--wrap,malloc,--wrap,free,--wrap,realloc,--wrap,calloc,--wrap,memalign \
-    /path/to/memlog/memlog_s.o -lpthread -ldl
+To your linker flags add:
+
+  `cat /path/to/memlog/memlog_s_ld_cmds`
+
+or, if your compiler and wrappers support response files (gcc and clang do, for
+example), simply:
+
+  @/path/to/memlog/memlog_s_ld_cmds
+
+so your overall linking command might look something like this:
+
+  mpic++ -O3 -g -o my_program my_obj1.o my_obj2.o @/path/to/memlog/memlog_s_ld_cmds

 ** RUNNING **
…
 use by running:

-  /path/to/memlog/memlog2dot /path/to/HOST.PID.memlog
+  /path/to/memlog/memlog_analyze /path/to/HOST.PID.memlog

 this will generate files named HOST.PID.memlog.dot, HOST.PID.memlog.ps and
…
 in textual form.

+If you pass the --leaks option to memlog_analyze, it will provide data on
+allocations active at the end of the program (leaks) instead of those active
+when the peak memory usage is first reached.
+
+You might have many runs of the same application (or output from many ranks of
+an MPI job), and you'd like to pick the one for analysis with the highest
+memory usage. If you provide a glob pattern to memlog_analyze it will do this
+for you. Make sure you quote the glob pattern so that your shell does not
+expand it.
+
+  /path/to/memlog/memlog_analyze "/path/to/*.memlog"
+
+When running under common batch systems, the files are named
+JOB_ID.HOST.PID.memlog, and when running under the BG/Q CNK, the process's rank
+is used instead of the node-local PID.
+
 Note that the peak memory usage is determined by monitoring the process's
 maximum resident set size, not just the total allocated heap memory.

-memlog2dot depends on dot (from the graphviz package) and ps2pdf (from the
+memlog_analyze takes, as a second optional parameter, the name of the output
+directory (the current directory is the default). If the directory does not
+exist, it will be created.
+
+memlog_analyze depends on dot (from the graphviz package) and ps2pdf (from the
 ghostscript package), plus various tools from the binutils package.
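As a usage note for the output-directory parameter added above, an invocation might look like the following; the directory name memlog_results is illustrative, and per the README text it will be created if it does not already exist.

  /path/to/memlog/memlog_analyze /path/to/HOST.PID.memlog memlog_results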
memlog.cpp
r09f3093 → r2d9e75d

 #include <cstdio>
 #include <cstring>
+#include <cstdint>

 // NOTE: This source makes very minimal use of C++11 features. It can still be
…
 #include <limits.h>
+#include <errno.h>
 #include <malloc.h>
 #include <execinfo.h>
+#include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/time.h>
…
 #include <dlfcn.h>

+#ifdef __bgq__
+#include <spi/include/kernel/location.h>
+#include <spi/include/kernel/memory.h>
+#endif
+
 using namespace std;

…
 // -Wl,--wrap,malloc,--wrap,free,--wrap,realloc,--wrap,calloc,--wrap,memalign /path/to/memlog_s.o -lpthread -ldl

-FILE *log_file = NULL;
+static FILE *log_file = 0;
 static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;

…
 static char self_path[PATH_MAX+1] = { '\0' };

+#ifdef __bgq__
+static int on_bgq = 0;
+#endif
+
+static void *initial_brk = 0;
+
+static unordered_map<void *, Dl_info> *dladdr_cache = 0;
+
 __attribute__((__constructor__))
 static void record_init() {
   struct utsname u;
   uname(&u);

+  int id = (int) getpid();
+#ifdef __bgq__
+  // If we're really running on a BG/Q compute node, use the job rank instead
+  // of the pid because the node name might not really be globally unique.
+  if (!strcmp(u.sysname, "CNK") && !strcmp(u.machine, "BGQ")) {
+    id = (int) Kernel_GetRank();
+    on_bgq = 1;
+  }
+#endif
+
+  // If we're running under a common batch system, add the job id to the output
+  // file names (add it as a prefix so that sorting the files will sort by job
+  // first).
+  char *job_id = 0;
+  const char *job_id_vars[] =
+    { "COBALT_JOBID", "PBS_JOBID", "SLURM_JOB_ID", "JOB_ID" };
+  for (int i = 0; i < sizeof(job_id_vars)/sizeof(job_id_vars[0]); ++i) {
+    job_id = getenv(job_id_vars[i]);
+    if (job_id)
+      break;
+  }
+
   char log_name[PATH_MAX+1];
-  snprintf(log_name, PATH_MAX+1, "%s.%d.memlog", u.nodename, getpid());
+  if (job_id)
+    snprintf(log_name, PATH_MAX+1, "%s.%s.%d.memlog", job_id, u.nodename, id);
+  else
+    snprintf(log_name, PATH_MAX+1, "%s.%d.memlog", u.nodename, id);
   log_file = fopen(log_name, "w");
   if (!log_file)
…
   const char *link_name = "/proc/self/exe";
   readlink(link_name, self_path, PATH_MAX);
+
+  initial_brk = sbrk(0);
 }

…
   (void) fflush(log_file);
   (void) fclose(log_file);
+
+  if (dladdr_cache)
+    delete dladdr_cache;
 }

…
 // we need to cache the lookup results.
 static int dladdr_cached(void * addr, Dl_info *info) {
-  static unordered_map<void *, Dl_info> dladdr_cache;
-
-  auto I = dladdr_cache.find(addr);
-  if (I == dladdr_cache.end()) {
+  if (!dladdr_cache)
+    dladdr_cache = new unordered_map<void *, Dl_info>;
+
+  auto I = dladdr_cache->find(addr);
+  if (I == dladdr_cache->end()) {
     int r;
     if (!(r = dladdr(addr, info)))
       memset(info, 0, sizeof(Dl_info));

-    dladdr_cache.insert(make_pair(addr, *info));
+    dladdr_cache->insert(make_pair(addr, *info));
     return r;
   }
…
   fprintf(log_file, "\t%ld.%06ld %ld %ld", usage.ru_utime.tv_sec,
     usage.ru_utime.tv_usec, usage.ru_maxrss, syscall(SYS_gettid));
+
+  // Some other memory stats (like with maxrss, report these in KB).
+  size_t arena_size = ((size_t) sbrk(0)) - (size_t) initial_brk;
+
+  uint64_t mmap_size = 0;
+#ifdef __bgq__
+  if (on_bgq)
+    (void) Kernel_GetMemorySize(KERNEL_MEMSIZE_MMAP, &mmap_size);
+#endif
+
+  fprintf(log_file, " %ld %ld", arena_size >> 10, mmap_size >> 10);

   if (!show_backtrace)
…
 }

+#ifdef __PIC__
+static int (*__real_posix_memalign)(void **memptr, size_t alignment,
+                                    size_t size) = 0;
+
+static void *(*__real_mmap)(void *addr, size_t length, int prot, int flags,
+                            int fd, off_t offset) = 0;
+static void *(*__real_mmap64)(void *addr, size_t length, int prot, int flags,
+                              int fd, off64_t offset) = 0;
+static int (*__real_munmap)(void *addr, size_t length) = 0;
+#else
+extern "C" {
+extern int __real_posix_memalign(void **memptr, size_t alignment, size_t size);
+
+extern void *__real_mmap(void *addr, size_t length, int prot, int flags,
+                         int fd, off_t offset);
+extern void *__real_mmap64(void *addr, size_t length, int prot, int flags,
+                           int fd, off64_t offset);
+extern int __real_munmap(void *addr, size_t length);
+}
+#endif
+
 // glibc exports its underlying malloc implementation under the name
 // __libc_malloc so that hooks like this can use it.
 extern "C" {
 extern void *__libc_malloc(size_t size);
+extern void *__libc_valloc(size_t size);
 extern void *__libc_realloc(void *ptr, size_t size);
 extern void *__libc_calloc(size_t nmemb, size_t size);
…

   void *ptr = __libc_malloc(size);
-
-  record_malloc(size, ptr, caller);
+  if (ptr)
+    record_malloc(size, ptr, caller);
+
+  in_malloc = 0;
+  return ptr;
+}
+
+void *FUNC(valloc)(size_t size) {
+  const void *caller =
+    __builtin_extract_return_addr(__builtin_return_address(0));
+
+  if (in_malloc)
+    return __libc_valloc(size);
+
+  in_malloc = 1;
+
+  void *ptr = __libc_valloc(size);
+  if (ptr)
+    record_malloc(size, ptr, caller);

   in_malloc = 0;
…
   if (ptr)
     record_free(ptr, caller);
-  record_malloc(size, nptr, caller);
+  if (nptr)
+    record_malloc(size, nptr, caller);

   in_malloc = 0;
…
   void *ptr = __libc_calloc(nmemb, size);

-  record_malloc(nmemb*size, ptr, caller);
+  if (ptr)
+    record_malloc(nmemb*size, ptr, caller);

   in_malloc = 0;
…
   void *ptr = __libc_memalign(boundary, size);

-  record_malloc(size, ptr, caller);
+  if (ptr)
+    record_malloc(size, ptr, caller);

   in_malloc = 0;
…
 }

+int FUNC(posix_memalign)(void **memptr, size_t alignment, size_t size) {
+  const void *caller =
+    __builtin_extract_return_addr(__builtin_return_address(0));
+
+#ifdef __PIC__
+  if (!__real_posix_memalign)
+    if (!(*(void **) (&__real_posix_memalign) =
+          dlsym(RTLD_NEXT, "posix_memalign"))) {
+      return ELIBACC;
+    }
+#endif
+
+  if (in_malloc)
+    return __real_posix_memalign(memptr, alignment, size);
+
+  in_malloc = 1;
+
+  int r = __real_posix_memalign(memptr, alignment, size);
+
+  if (!r)
+    record_malloc(size, *memptr, caller);
+
+  in_malloc = 0;
+
+  return r;
+}
+
+void *FUNC(mmap)(void *addr, size_t length, int prot, int flags,
+                 int fd, off_t offset) {
+  const void *caller =
+    __builtin_extract_return_addr(__builtin_return_address(0));
+
+#ifdef __PIC__
+  if (!__real_mmap)
+    if (!(*(void **) (&__real_mmap) = dlsym(RTLD_NEXT, "mmap"))) {
+      errno = ELIBACC;
+      return MAP_FAILED;
+    }
+#endif
+
+  if (in_malloc)
+    return __real_mmap(addr, length, prot, flags, fd, offset);
+
+  in_malloc = 1;
+
+  void *ptr = __real_mmap(addr, length, prot, flags, fd, offset);
+
+  if (ptr != MAP_FAILED)
+    record_malloc(length, ptr, caller);
+
+  in_malloc = 0;
+
+  return ptr;
+}
+
+void *FUNC(mmap64)(void *addr, size_t length, int prot, int flags,
+                   int fd, off64_t offset) {
+  const void *caller =
+    __builtin_extract_return_addr(__builtin_return_address(0));
+
+#ifdef __PIC__
+  if (!__real_mmap64)
+    if (!(*(void **) (&__real_mmap64) = dlsym(RTLD_NEXT, "mmap64"))) {
+      errno = ELIBACC;
+      return MAP_FAILED;
+    }
+#endif
+
+  if (in_malloc)
+    return __real_mmap64(addr, length, prot, flags, fd, offset);
+
+  in_malloc = 1;
+
+  void *ptr = __real_mmap64(addr, length, prot, flags, fd, offset);
+
+  if (ptr != MAP_FAILED)
+    record_malloc(length, ptr, caller);
+
+  in_malloc = 0;
+
+  return ptr;
+}
+
+int FUNC(munmap)(void *addr, size_t length) {
+  const void *caller =
+    __builtin_extract_return_addr(__builtin_return_address(0));
+
+#ifdef __PIC__
+  if (!__real_munmap)
+    if (!(*(void **) (&__real_munmap) = dlsym(RTLD_NEXT, "munmap"))) {
+      errno = ELIBACC;
+      return -1;
+    }
+#endif
+
+  if (in_malloc)
+    return __real_munmap(addr, length);
+
+  in_malloc = 1;
+
+  record_free(addr, caller);
+
+  int r = __real_munmap(addr, length);
+
+  in_malloc = 0;
+
+  return r;
+}
+
 } // extern "C"
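All of the new wrappers added above (valloc, posix_memalign, mmap, mmap64, munmap) follow the same re-entrancy discipline as the existing hooks: they check the in_malloc flag before recording, so allocations performed while a record is being written are passed straight through instead of being recorded recursively. The flag's definition sits in unchanged code that this changeset does not show, so the following is only an illustrative sketch of the pattern under assumed names (guard_demo.cpp, in_hook, record_event); memlog's actual flag may be managed differently.

    // guard_demo.cpp -- illustrative only; link with -Wl,--wrap,malloc
    #include <cstdio>
    #include <cstdlib>

    extern "C" void *__real_malloc(std::size_t size);  // provided by --wrap,malloc

    // Hypothetical per-thread guard, standing in for memlog's in_malloc flag.
    static thread_local bool in_hook = false;

    static void record_event(std::size_t size, void *ptr) {
      // This allocation re-enters __wrap_malloc; the guard turns it into a
      // plain pass-through instead of another recorded (and recursing) event.
      char *buf = static_cast<char *>(std::malloc(64));
      if (buf) {
        std::snprintf(buf, 64, "malloc %zu -> %p\n", size, ptr);
        std::fputs(buf, stderr);
        std::free(buf);
      }
    }

    extern "C" void *__wrap_malloc(std::size_t size) {
      if (in_hook)
        return __real_malloc(size);  // nested call: skip recording

      in_hook = true;
      void *ptr = __real_malloc(size);
      if (ptr)
        record_event(size, ptr);
      in_hook = false;
      return ptr;
    }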