diff --git a/benchmark b/benchmark
new file mode 100755
index 0000000000000000000000000000000000000000..a73799e6c19b7ab5e290194f5fb891ad881e6801
Binary files /dev/null and b/benchmark differ
diff --git a/events.0.edf b/events.0.edf
new file mode 100644
index 0000000000000000000000000000000000000000..5a3bdf42f2cf8b4d262b067818c3be5738cb2b4d
--- /dev/null
+++ b/events.0.edf
@@ -0,0 +1,135 @@
+134 dynamic_trace_events
+# FunctionId Group Tag "Name Type" Parameters
+0 TAUEVENT 0 ".TAU <unknown event>" TriggerValue
+1 TAU_DEFAULT 0 ".TAU application " EntryExit
+12 TAU_DEFAULT 0 "taupreload_main " EntryExit
+13 TAU_USER 0 "cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *) C " EntryExit
+14 TAU_USER 0 "cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *) C " EntryExit
+15 TAU_USER 0 "cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) C " EntryExit
+16 TAU_USER 0 "cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *) C " EntryExit
+17 TAU_USER 0 "cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *) C " EntryExit
+18 TAU_USER 0 "cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *) C " EntryExit
+19 TAU_USER 0 "cl_int clRetainCommandQueue(cl_command_queue) C " EntryExit
+20 TAU_USER 0 "cl_int clRetainContext(cl_context) C " EntryExit
+21 TAU_USER 0 "cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C " EntryExit
+22 TAU_USER 0 "cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *) C " EntryExit
+23 TAU_USER 0 "cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *) C " EntryExit
+24 TAU_USER 0 "cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) C " EntryExit
+25 TAU_USER 0 "cl_int clReleaseContext(cl_context) C " EntryExit
+26 TAU_USER 0 "cl_kernel clCreateKernel(cl_program, const char *, cl_int *) C " EntryExit
+27 TAU_USER 0 "cl_int clReleaseProgram(cl_program) C " EntryExit
+28 TAU_USER 0 "cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *) C " EntryExit
+29 TAU_USER 0 "cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) C " EntryExit
+30 TAU_USER 0 "cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) C " EntryExit
+33 TAU_USER 0 "WriteBuffer " EntryExit
+35 TAU_USER 0 "cl_int clReleaseCommandQueue(cl_command_queue) C " EntryExit
+36 TAU_USER 0 "cl_int clRetainKernel(cl_kernel) C " EntryExit
+37 TAU_USER 0 "cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *) C " EntryExit
+38 TAU_USER 0 "cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) C " EntryExit
+39 TAU_USER 0 "cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *) C " EntryExit
+40 TAU_USER 0 "cl_int clReleaseKernel(cl_kernel) C " EntryExit
+41 TAU_USER 0 "cl_int clFinish(cl_command_queue) C " EntryExit
+42 TAU_USER 0 "fill " EntryExit
+43 TAU_USER 0 "matrix_mul " EntryExit
+2 TAUEVENT 0 "Bytes copied from Host to Device" TriggerValue
+3 TAUEVENT 0 "Bytes copied from Device to Host" TriggerValue
+4 TAUEVENT 0 "Bytes copied from Device to Device" TriggerValue
+5 TAUEVENT 0 "Correlation ID" TriggerValue
+6 TAUEVENT 0 "Unified Memory Bytes copied from Host to Device" TriggerValue
+7 TAUEVENT 0 "Unified Memory Bytes copied from Device to Host" TriggerValue
+8 TAUEVENT 0 "Unified Memory CPU Page Faults" TriggerValue
+9 TAUEVENT 0 "Floating Point Operations" TriggerValue
+10 TAUEVENT 0 "Memory Operations" TriggerValue
+11 TAUEVENT 0 "Control Operations" TriggerValue
+31 TAUEVENT 0 "Time in Queue (us)" TriggerValue
+32 TAUEVENT 0 "Time in Submitted (us)" TriggerValue
+34 TAUEVENT 0 "Correlation ID : WriteBuffer" TriggerValue
+44 TAUEVENT 0 "CPU Cores | 16" TriggerValue
+45 TAUEVENT 0 "CPU MHz | 3599.999" TriggerValue
+46 TAUEVENT 0 "CPU Type | Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz" TriggerValue
+47 TAUEVENT 0 "CPU Vendor | GenuineIntel" TriggerValue
+48 TAUEVENT 0 "CPUs Allowed | 00000001,00000001" TriggerValue
+49 TAUEVENT 0 "CPUs Allowed List | 0,32" TriggerValue
+50 TAUEVENT 0 "CWD | /auto/home/users/j/h/jhano/P3" TriggerValue
+51 TAUEVENT 0 "Cache Size | 36864 KB" TriggerValue
+52 TAUEVENT 0 "Command Line | ./main" TriggerValue
+53 TAUEVENT 0 "Executable | /auto/home/users/j/h/jhano/P3/main" TriggerValue
+54 TAUEVENT 0 "Hostname | mb-icg102.cism.ucl.ac.be" TriggerValue
+55 TAUEVENT 0 "Local Time | 2025-05-20T09:39:54+02:00" TriggerValue
+56 TAUEVENT 0 "Memories Allowed | 00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000003" TriggerValue
+57 TAUEVENT 0 "Memories Allowed List | 0-1" TriggerValue
+58 TAUEVENT 0 "Memory Size | 263747860 kB" TriggerValue
+59 TAUEVENT 0 "Node Name | mb-icg102.cism.ucl.ac.be" TriggerValue
+60 TAUEVENT 0 "OS Machine | x86_64" TriggerValue
+61 TAUEVENT 0 "OS Name | Linux" TriggerValue
+62 TAUEVENT 0 "OS Release | 5.4.286-1.el8.elrepo.x86_64" TriggerValue
+63 TAUEVENT 0 "OS Version | #1 SMP Sun Nov 17 11:28:26 EST 2024" TriggerValue
+64 TAUEVENT 0 "Starting Timestamp | 1747726794996778" TriggerValue
+65 TAUEVENT 0 "TAU Architecture | default" TriggerValue
+66 TAUEVENT 0 "TAU Config |  -opencl=/opt/sw/arch/easybuild/2023b/software/CUDA/12.1.1/targets/x86_64-linux/" TriggerValue
+67 TAUEVENT 0 "TAU Makefile | /opt/sw/noarch/manual/2023b/softwares/TAU2/tau2/x86_64/lib/Makefile.tau" TriggerValue
+68 TAUEVENT 0 "TAU Version | 2.34-git" TriggerValue
+69 TAUEVENT 0 "TAU_BFD_LOOKUP | on" TriggerValue
+70 TAUEVENT 0 "TAU_CALLPATH_DEPTH | 2" TriggerValue
+71 TAUEVENT 0 "TAU_CALLSITE_DEPTH | 1" TriggerValue
+72 TAUEVENT 0 "TAU_CUDA_BINARY_EXE | " TriggerValue
+73 TAUEVENT 0 "TAU_CUPTI_API | runtime" TriggerValue
+74 TAUEVENT 0 "TAU_CUPTI_PC | off" TriggerValue
+75 TAUEVENT 0 "TAU_CURRENT_TIMER_EXIT_PARAMS | off" TriggerValue
+76 TAUEVENT 0 "TAU_EBS_KEEP_UNRESOLVED_ADDR | off" TriggerValue
+77 TAUEVENT 0 "TAU_IBM_BG_HWP_COUNTERS | off" TriggerValue
+78 TAUEVENT 0 "TAU_MEASURE_TAU | off" TriggerValue
+79 TAUEVENT 0 "TAU_MEMDBG_PROTECT_ABOVE | off" TriggerValue
+80 TAUEVENT 0 "TAU_MEMDBG_PROTECT_BELOW | off" TriggerValue
+81 TAUEVENT 0 "TAU_MEMDBG_PROTECT_FREE | off" TriggerValue
+82 TAUEVENT 0 "TAU_MEMMGR_MAX_BLOCKS | 64" TriggerValue
+83 TAUEVENT 0 "TAU_OPENMP_RUNTIME | on" TriggerValue
+84 TAUEVENT 0 "TAU_OPENMP_RUNTIME_EVENTS | on" TriggerValue
+85 TAUEVENT 0 "TAU_OPENMP_RUNTIME_STATES | off" TriggerValue
+86 TAUEVENT 0 "TAU_OUTPUT_CUDA_CSV | off" TriggerValue
+87 TAUEVENT 0 "TAU_PAPI_MULTIPLEXING | off" TriggerValue
+88 TAUEVENT 0 "TAU_PROFILE | off" TriggerValue
+89 TAUEVENT 0 "TAU_PROFILE_FORMAT | profile" TriggerValue
+90 TAUEVENT 0 "TAU_RECYCLE_THREADS | off" TriggerValue
+91 TAUEVENT 0 "TAU_REGION_ADDRESSES | off" TriggerValue
+92 TAUEVENT 0 "TAU_SAMPLING | off" TriggerValue
+93 TAUEVENT 0 "TAU_SHOW_MEMORY_FUNCTIONS | off" TriggerValue
+94 TAUEVENT 0 "TAU_SIGNALS_GDB | off" TriggerValue
+95 TAUEVENT 0 "TAU_SYNCHRONIZE_CLOCKS | off" TriggerValue
+96 TAUEVENT 0 "TAU_THROTTLE | on" TriggerValue
+97 TAUEVENT 0 "TAU_THROTTLE_NUMCALLS | 100000" TriggerValue
+98 TAUEVENT 0 "TAU_THROTTLE_PERCALL | 10" TriggerValue
+99 TAUEVENT 0 "TAU_TRACE | on" TriggerValue
+100 TAUEVENT 0 "TAU_TRACE_FORMAT | tau" TriggerValue
+101 TAUEVENT 0 "TAU_TRACK_CUDA_CDP | off" TriggerValue
+102 TAUEVENT 0 "TAU_TRACK_CUDA_ENV | off" TriggerValue
+103 TAUEVENT 0 "TAU_TRACK_CUDA_INSTRUCTIONS | " TriggerValue
+104 TAUEVENT 0 "TAU_TRACK_CUDA_SASS | off" TriggerValue
+105 TAUEVENT 0 "TAU_TRACK_HEADROOM | off" TriggerValue
+106 TAUEVENT 0 "TAU_TRACK_HEAP | off" TriggerValue
+107 TAUEVENT 0 "TAU_TRACK_IO_PARAMS | off" TriggerValue
+108 TAUEVENT 0 "TAU_TRACK_MEMORY_FOOTPRINT | off" TriggerValue
+109 TAUEVENT 0 "TAU_TRACK_MEMORY_LEAKS | off" TriggerValue
+110 TAUEVENT 0 "TAU_TRACK_SIGNALS | off" TriggerValue
+111 TAUEVENT 0 "TAU_TRACK_UNIFIED_MEMORY | off" TriggerValue
+112 TAUEVENT 0 "TAU_VERBOSE_RANK | -1" TriggerValue
+113 TAUEVENT 0 "Timestamp | 1747726794996859" TriggerValue
+114 TAUEVENT 0 "UTC Time | 2025-05-20T07:39:54Z" TriggerValue
+115 TAUEVENT 0 "pid | 2856919" TriggerValue
+116 TAUEVENT 0 "tid | 2856919" TriggerValue
+117 TAUEVENT 0 "username | jhano" TriggerValue
+60000 TRACER 0 "EV_INIT" none
+60001 TRACER 0 "FLUSH" EntryExit
+60003 TRACER 0 "FLUSH_CLOSE" none
+60004 TRACER 0 "FLUSH_INITM" none
+60005 TRACER 0 "WALL_CLOCK" none
+60006 TRACER 0 "CONT_EVENT" none
+60007 TAU_MESSAGE -7 "MESSAGE_SEND" par
+60008 TAU_MESSAGE -8 "MESSAGE_RECV" par
+70000 TAUEVENT 0 "ONESIDED_MESSAGE_SEND" TriggerValue
+70001 TAUEVENT 0 "ONESIDED_MESSAGE_RECV" TriggerValue
+70005 TAUEVENT 0 "ONESIDED_MESSAGE_RECIPROCAL_SEND" TriggerValue
+70006 TAUEVENT 0 "ONESIDED_MESSAGE_RECIPROCAL_RECV" TriggerValue
+70004 TAUEVENT 0 "ONESIDED_MESSAGE" TriggerValue
+70002 TAUEVENT 0 "ONESIDED_MESSAGE_ID_TriggerValueT1" TriggerValue
+70003 TAUEVENT 0 "ONESIDED_MESSAGE_ID_TriggerValueT2" TriggerValue
diff --git a/main b/main
old mode 100644
new mode 100755
index 841b3a3a9ba9d36d0501888218945734f692e8b7..cebc82616920ce1b2b3071bba03e9d01829c6ac9
Binary files a/main and b/main differ
diff --git a/profile.0.0.0 b/profile.0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..d2a5e35e580edf498781ae1f4718016c3102f26f
--- /dev/null
+++ b/profile.0.0.0
@@ -0,0 +1,33 @@
+27 templated_functions_MULTI_TAUGPU_TIME
+# Name Calls Subrs Excl Incl ProfileCalls # <metadata><attribute><name>Metric Name</name><value>TAUGPU_TIME</value></attribute><attribute><name>CPU Cores</name><value>16</value></attribute><attribute><name>CPU MHz</name><value>3578.336</value></attribute><attribute><name>CPU Type</name><value>Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz</value></attribute><attribute><name>CPU Vendor</name><value>GenuineIntel</value></attribute><attribute><name>CPUs Allowed</name><value>00020000,00020000</value></attribute><attribute><name>CPUs Allowed List</name><value>17,49</value></attribute><attribute><name>CWD</name><value>/auto/home/users/j/h/jhano/P3</value></attribute><attribute><name>Cache Size</name><value>36864 KB</value></attribute><attribute><name>Command Line</name><value>./main</value></attribute><attribute><name>Ending Timestamp</name><value>1747646983158546</value></attribute><attribute><name>Executable</name><value>/auto/home/users/j/h/jhano/P3/main</value></attribute><attribute><name>Hostname</name><value>mb-icg102.cism.ucl.ac.be</value></attribute><attribute><name>Local Time</name><value>2025-05-19T11:29:41+02:00</value></attribute><attribute><name>Memories Allowed</name><value>00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000003</value></attribute><attribute><name>Memories Allowed List</name><value>0-1</value></attribute><attribute><name>Memory Size</name><value>263747860 kB</value></attribute><attribute><name>Node Name</name><value>mb-icg102.cism.ucl.ac.be</value></attribute><attribute><name>OS Machine</name><value>x86_64</value></attribute><attribute><name>OS Name</name><value>Linux</value></attribute><attribute><name>OS Release</name><value>5.4.286-1.el8.elrepo.x86_64</value></attribute><attribute><name>OS Version</name><value>#1 SMP Sun Nov 17 11:28:26 EST 2024</value></attribute><attribute><name>Starting Timestamp</name><value>1747646981315961</value></attribute><attribute><name>TAU Architecture</name><value>default</value></attribute><attribute><name>TAU Config</name><value> -opencl=/opt/sw/arch/easybuild/2023b/software/CUDA/12.1.1/targets/x86_64-linux/</value></attribute><attribute><name>TAU Makefile</name><value>/opt/sw/noarch/manual/2023b/softwares/TAU2/tau2/x86_64/lib/Makefile.tau</value></attribute><attribute><name>TAU Version</name><value>2.34-git</value></attribute><attribute><name>TAU_BFD_LOOKUP</name><value>on</value></attribute><attribute><name>TAU_CALLPATH</name><value>off</value></attribute><attribute><name>TAU_CALLPATH_DEPTH</name><value>2</value></attribute><attribute><name>TAU_CALLSITE_DEPTH</name><value>1</value></attribute><attribute><name>TAU_COMPENSATE</name><value>off</value></attribute><attribute><name>TAU_CUDA_BINARY_EXE</name><value></value></attribute><attribute><name>TAU_CUPTI_API</name><value>runtime</value></attribute><attribute><name>TAU_CUPTI_PC</name><value>off</value></attribute><attribute><name>TAU_CURRENT_TIMER_EXIT_PARAMS</name><value>on</value></attribute><attribute><name>TAU_EBS_KEEP_UNRESOLVED_ADDR</name><value>off</value></attribute><attribute><name>TAU_ENABLE_THREAD_CONTEXT</name><value>off</value></attribute><attribute><name>TAU_IBM_BG_HWP_COUNTERS</name><value>off</value></attribute><attribute><name>TAU_MEASURE_TAU</name><value>off</value></attribute><attribute><name>TAU_MEMDBG_PROTECT_ABOVE</name><value>off</value></attribute><attribute><name>TAU_MEMDBG_PROTECT_BELOW</name><value>off</value></attribute><attribute><name>TAU_MEMDBG_PROTECT_FREE</name><value>off</value></attribute><attribute><name>TAU_MEMMGR_MAX_BLOCKS</name><value>64</value></attribute><attribute><name>TAU_OPENMP_RUNTIME</name><value>on</value></attribute><attribute><name>TAU_OPENMP_RUNTIME_EVENTS</name><value>on</value></attribute><attribute><name>TAU_OPENMP_RUNTIME_STATES</name><value>off</value></attribute><attribute><name>TAU_OUTPUT_CUDA_CSV</name><value>off</value></attribute><attribute><name>TAU_PAPI_MULTIPLEXING</name><value>off</value></attribute><attribute><name>TAU_PROFILE</name><value>on</value></attribute><attribute><name>TAU_PROFILE_FORMAT</name><value>profile</value></attribute><attribute><name>TAU_RECYCLE_THREADS</name><value>off</value></attribute><attribute><name>TAU_REGION_ADDRESSES</name><value>off</value></attribute><attribute><name>TAU_SAMPLING</name><value>off</value></attribute><attribute><name>TAU_SHOW_MEMORY_FUNCTIONS</name><value>off</value></attribute><attribute><name>TAU_SIGNALS_GDB</name><value>off</value></attribute><attribute><name>TAU_THROTTLE</name><value>on</value></attribute><attribute><name>TAU_THROTTLE_NUMCALLS</name><value>100000</value></attribute><attribute><name>TAU_THROTTLE_PERCALL</name><value>10</value></attribute><attribute><name>TAU_TRACE</name><value>off</value></attribute><attribute><name>TAU_TRACE_FORMAT</name><value>tau</value></attribute><attribute><name>TAU_TRACK_CUDA_CDP</name><value>off</value></attribute><attribute><name>TAU_TRACK_CUDA_ENV</name><value>off</value></attribute><attribute><name>TAU_TRACK_CUDA_INSTRUCTIONS</name><value></value></attribute><attribute><name>TAU_TRACK_CUDA_SASS</name><value>off</value></attribute><attribute><name>TAU_TRACK_HEADROOM</name><value>off</value></attribute><attribute><name>TAU_TRACK_HEAP</name><value>off</value></attribute><attribute><name>TAU_TRACK_IO_PARAMS</name><value>off</value></attribute><attribute><name>TAU_TRACK_MEMORY_FOOTPRINT</name><value>off</value></attribute><attribute><name>TAU_TRACK_MEMORY_LEAKS</name><value>off</value></attribute><attribute><name>TAU_TRACK_SIGNALS</name><value>off</value></attribute><attribute><name>TAU_TRACK_UNIFIED_MEMORY</name><value>off</value></attribute><attribute><name>TAU_VERBOSE_RANK</name><value>-1</value></attribute><attribute><name>Timestamp</name><value>1747646981316029</value></attribute><attribute><name>UTC Time</name><value>2025-05-19T09:29:41Z</value></attribute><attribute><name>pid</name><value>2612340</value></attribute><attribute><name>tid</name><value>2612340</value></attribute><attribute><name>username</name><value>jhano</value></attribute><attribute><name>Ending Timestamp</name><value>1747646983158546</value></attribute></metadata>
+".TAU application" 1 1 12295 1843253 0 GROUP="TAU_DEFAULT" 
+"taupreload_main" 1 158 726369 1830958 0 GROUP="TAU_DEFAULT" 
+"cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *) C" 2 0 66232 66232 0 GROUP="TAU_USER" 
+"cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *) C" 2 0 2 2 0 GROUP="TAU_USER" 
+"cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) C" 2 0 1 1 0 GROUP="TAU_USER" 
+"cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *) C" 4 0 31 31 0 GROUP="TAU_USER" 
+"cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *) C" 1 0 107413 107413 0 GROUP="TAU_USER" 
+"cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *) C" 1 0 7 7 0 GROUP="TAU_USER" 
+"cl_int clRetainCommandQueue(cl_command_queue) C" 7 0 10 10 0 GROUP="TAU_USER" 
+"cl_int clRetainContext(cl_context) C" 17 0 7 7 0 GROUP="TAU_USER" 
+"cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C" 9 0 8 8 0 GROUP="TAU_USER" 
+"cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *) C" 9 0 14619 14619 0 GROUP="TAU_USER" 
+"cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *) C" 18 0 12 12 0 GROUP="TAU_USER" 
+"cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) C" 18 0 7 7 0 GROUP="TAU_USER" 
+"cl_int clReleaseContext(cl_context) C" 18 0 5 5 0 GROUP="TAU_USER" 
+"cl_kernel clCreateKernel(cl_program, const char *, cl_int *) C" 9 0 118 118 0 GROUP="TAU_USER" 
+"cl_int clReleaseProgram(cl_program) C" 9 0 5 5 0 GROUP="TAU_USER" 
+"cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *) C" 3 0 10 10 0 GROUP="TAU_USER" 
+"cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) C" 2 0 11518 11518 0 GROUP="TAU_USER" 
+"cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) C" 2 0 1 1 0 GROUP="TAU_USER" 
+"cl_int clReleaseCommandQueue(cl_command_queue) C" 7 0 21 21 0 GROUP="TAU_USER" 
+"cl_int clRetainKernel(cl_kernel) C" 1 0 1 1 0 GROUP="TAU_USER" 
+"cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *) C" 11 0 2 2 0 GROUP="TAU_USER" 
+"cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) C" 2 0 2553 2553 0 GROUP="TAU_USER" 
+"cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *) C" 2 0 2 2 0 GROUP="TAU_USER" 
+"cl_int clReleaseKernel(cl_kernel) C" 1 0 1 1 0 GROUP="TAU_USER" 
+"cl_int clFinish(cl_command_queue) C" 1 0 902003 902003 0 GROUP="TAU_USER" 
+0 aggregates
+1 userevents
+# eventname numevents max min mean sumsqr
+"Bytes copied from Host to Device" 4 67108864 67108864 67108864 1.801439850948198E+16
diff --git a/profile.0.0.1 b/profile.0.0.1
new file mode 100644
index 0000000000000000000000000000000000000000..3b9aada052f976a61f6f340fd97e9dca099f73ed
--- /dev/null
+++ b/profile.0.0.1
@@ -0,0 +1,7 @@
+4 templated_functions_MULTI_TAUGPU_TIME
+# Name Calls Subrs Excl Incl ProfileCalls # <metadata><attribute><name>Metric Name</name><value>TAUGPU_TIME</value></attribute><attribute><name>CPU Cores</name><value>16</value></attribute><attribute><name>CPU MHz</name><value>3578.336</value></attribute><attribute><name>CPU Type</name><value>Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz</value></attribute><attribute><name>CPU Vendor</name><value>GenuineIntel</value></attribute><attribute><name>CPUs Allowed</name><value>00020000,00020000</value></attribute><attribute><name>CPUs Allowed List</name><value>17,49</value></attribute><attribute><name>CWD</name><value>/auto/home/users/j/h/jhano/P3</value></attribute><attribute><name>Cache Size</name><value>36864 KB</value></attribute><attribute><name>Command Line</name><value>./main</value></attribute><attribute><name>Executable</name><value>/auto/home/users/j/h/jhano/P3/main</value></attribute><attribute><name>Hostname</name><value>mb-icg102.cism.ucl.ac.be</value></attribute><attribute><name>Local Time</name><value>2025-05-19T11:29:41+02:00</value></attribute><attribute><name>Memories Allowed</name><value>00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000003</value></attribute><attribute><name>Memories Allowed List</name><value>0-1</value></attribute><attribute><name>Memory Size</name><value>263747860 kB</value></attribute><attribute><name>Node Name</name><value>mb-icg102.cism.ucl.ac.be</value></attribute><attribute><name>OS Machine</name><value>x86_64</value></attribute><attribute><name>OS Name</name><value>Linux</value></attribute><attribute><name>OS Release</name><value>5.4.286-1.el8.elrepo.x86_64</value></attribute><attribute><name>OS Version</name><value>#1 SMP Sun Nov 17 11:28:26 EST 2024</value></attribute><attribute><name>OpenCL Command Queue</name><value>00</value></attribute><attribute><name>OpenCL Device</name><value>0</value></attribute><attribute><name>Starting Timestamp</name><value>1747646981315961</value></attribute><attribute><name>TAU Architecture</name><value>default</value></attribute><attribute><name>TAU Config</name><value> -opencl=/opt/sw/arch/easybuild/2023b/software/CUDA/12.1.1/targets/x86_64-linux/</value></attribute><attribute><name>TAU Makefile</name><value>/opt/sw/noarch/manual/2023b/softwares/TAU2/tau2/x86_64/lib/Makefile.tau</value></attribute><attribute><name>TAU Version</name><value>2.34-git</value></attribute><attribute><name>TAU_BFD_LOOKUP</name><value>on</value></attribute><attribute><name>TAU_CALLPATH</name><value>off</value></attribute><attribute><name>TAU_CALLPATH_DEPTH</name><value>2</value></attribute><attribute><name>TAU_CALLSITE_DEPTH</name><value>1</value></attribute><attribute><name>TAU_COMPENSATE</name><value>off</value></attribute><attribute><name>TAU_CUDA_BINARY_EXE</name><value></value></attribute><attribute><name>TAU_CUPTI_API</name><value>runtime</value></attribute><attribute><name>TAU_CUPTI_PC</name><value>off</value></attribute><attribute><name>TAU_CURRENT_TIMER_EXIT_PARAMS</name><value>on</value></attribute><attribute><name>TAU_EBS_KEEP_UNRESOLVED_ADDR</name><value>off</value></attribute><attribute><name>TAU_ENABLE_THREAD_CONTEXT</name><value>off</value></attribute><attribute><name>TAU_IBM_BG_HWP_COUNTERS</name><value>off</value></attribute><attribute><name>TAU_MEASURE_TAU</name><value>off</value></attribute><attribute><name>TAU_MEMDBG_PROTECT_ABOVE</name><value>off</value></attribute><attribute><name>TAU_MEMDBG_PROTECT_BELOW</name><value>off</value></attribute><attribute><name>TAU_MEMDBG_PROTECT_FREE</name><value>off</value></attribute><attribute><name>TAU_MEMMGR_MAX_BLOCKS</name><value>64</value></attribute><attribute><name>TAU_OPENMP_RUNTIME</name><value>on</value></attribute><attribute><name>TAU_OPENMP_RUNTIME_EVENTS</name><value>on</value></attribute><attribute><name>TAU_OPENMP_RUNTIME_STATES</name><value>off</value></attribute><attribute><name>TAU_OUTPUT_CUDA_CSV</name><value>off</value></attribute><attribute><name>TAU_PAPI_MULTIPLEXING</name><value>off</value></attribute><attribute><name>TAU_PROFILE</name><value>on</value></attribute><attribute><name>TAU_PROFILE_FORMAT</name><value>profile</value></attribute><attribute><name>TAU_RECYCLE_THREADS</name><value>off</value></attribute><attribute><name>TAU_REGION_ADDRESSES</name><value>off</value></attribute><attribute><name>TAU_SAMPLING</name><value>off</value></attribute><attribute><name>TAU_SHOW_MEMORY_FUNCTIONS</name><value>off</value></attribute><attribute><name>TAU_SIGNALS_GDB</name><value>off</value></attribute><attribute><name>TAU_THROTTLE</name><value>on</value></attribute><attribute><name>TAU_THROTTLE_NUMCALLS</name><value>100000</value></attribute><attribute><name>TAU_THROTTLE_PERCALL</name><value>10</value></attribute><attribute><name>TAU_TRACE</name><value>off</value></attribute><attribute><name>TAU_TRACE_FORMAT</name><value>tau</value></attribute><attribute><name>TAU_TRACK_CUDA_CDP</name><value>off</value></attribute><attribute><name>TAU_TRACK_CUDA_ENV</name><value>off</value></attribute><attribute><name>TAU_TRACK_CUDA_INSTRUCTIONS</name><value></value></attribute><attribute><name>TAU_TRACK_CUDA_SASS</name><value>off</value></attribute><attribute><name>TAU_TRACK_HEADROOM</name><value>off</value></attribute><attribute><name>TAU_TRACK_HEAP</name><value>off</value></attribute><attribute><name>TAU_TRACK_IO_PARAMS</name><value>off</value></attribute><attribute><name>TAU_TRACK_MEMORY_FOOTPRINT</name><value>off</value></attribute><attribute><name>TAU_TRACK_MEMORY_LEAKS</name><value>off</value></attribute><attribute><name>TAU_TRACK_SIGNALS</name><value>off</value></attribute><attribute><name>TAU_TRACK_UNIFIED_MEMORY</name><value>off</value></attribute><attribute><name>TAU_VERBOSE_RANK</name><value>-1</value></attribute><attribute><name>Timestamp</name><value>1747646981316029</value></attribute><attribute><name>UTC Time</name><value>2025-05-19T09:29:41Z</value></attribute><attribute><name>pid</name><value>2612340</value></attribute><attribute><name>tid</name><value>2612340</value></attribute><attribute><name>username</name><value>jhano</value></attribute><attribute><name>Ending Timestamp</name><value>1747646983158546</value></attribute></metadata>
+".TAU application" 1 4 924387 1839791 0 GROUP="TAU_DEFAULT" 
+"WriteBuffer" 2 0 10988 10988 0 GROUP="TAU_USER" 
+"fill" 1 0 143 143 0 GROUP="TAU_USER" 
+"matrix_mul" 1 0 904273 904273 0 GROUP="TAU_USER" 
+0 aggregates
diff --git a/tau.edf b/tau.edf
new file mode 100644
index 0000000000000000000000000000000000000000..2b2076373156425c8945e9793524f13a8e4ffadd
--- /dev/null
+++ b/tau.edf
@@ -0,0 +1,135 @@
+133 dynamic_trace_events
+# FunctionId Group Tag "Name Type" Parameters
+1 TAUEVENT 0 ".TAU <unknown event>" TriggerValue
+2 TAU_DEFAULT 0 ".TAU application " EntryExit
+34 TAUEVENT 0 "Bytes copied from Device to Device" TriggerValue
+33 TAUEVENT 0 "Bytes copied from Device to Host" TriggerValue
+32 TAUEVENT 0 "Bytes copied from Host to Device" TriggerValue
+124 TRACER 0 "CONT_EVENT" none
+45 TAUEVENT 0 "CPU Cores | 16" TriggerValue
+46 TAUEVENT 0 "CPU MHz | 3599.999" TriggerValue
+47 TAUEVENT 0 "CPU Type | Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz" TriggerValue
+48 TAUEVENT 0 "CPU Vendor | GenuineIntel" TriggerValue
+50 TAUEVENT 0 "CPUs Allowed List | 0,32" TriggerValue
+49 TAUEVENT 0 "CPUs Allowed | 00000001,00000001" TriggerValue
+51 TAUEVENT 0 "CWD | /auto/home/users/j/h/jhano/P3" TriggerValue
+52 TAUEVENT 0 "Cache Size | 36864 KB" TriggerValue
+53 TAUEVENT 0 "Command Line | ./main" TriggerValue
+41 TAUEVENT 0 "Control Operations" TriggerValue
+44 TAUEVENT 0 "Correlation ID : WriteBuffer" TriggerValue
+35 TAUEVENT 0 "Correlation ID" TriggerValue
+119 TRACER 0 "EV_INIT" none
+54 TAUEVENT 0 "Executable | /auto/home/users/j/h/jhano/P3/main" TriggerValue
+120 TRACER 0 "FLUSH" EntryExit
+121 TRACER 0 "FLUSH_CLOSE" none
+122 TRACER 0 "FLUSH_INITM" none
+39 TAUEVENT 0 "Floating Point Operations" TriggerValue
+55 TAUEVENT 0 "Hostname | mb-icg102.cism.ucl.ac.be" TriggerValue
+56 TAUEVENT 0 "Local Time | 2025-05-20T09:39:54+02:00" TriggerValue
+126 TAU_MESSAGE -8 "MESSAGE_RECV" par
+125 TAU_MESSAGE -7 "MESSAGE_SEND" par
+58 TAUEVENT 0 "Memories Allowed List | 0-1" TriggerValue
+57 TAUEVENT 0 "Memories Allowed | 00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000003" TriggerValue
+40 TAUEVENT 0 "Memory Operations" TriggerValue
+59 TAUEVENT 0 "Memory Size | 263747860 kB" TriggerValue
+60 TAUEVENT 0 "Node Name | mb-icg102.cism.ucl.ac.be" TriggerValue
+131 TAUEVENT 0 "ONESIDED_MESSAGE" TriggerValue
+132 TAUEVENT 0 "ONESIDED_MESSAGE_ID_TriggerValueT1" TriggerValue
+133 TAUEVENT 0 "ONESIDED_MESSAGE_ID_TriggerValueT2" TriggerValue
+130 TAUEVENT 0 "ONESIDED_MESSAGE_RECIPROCAL_RECV" TriggerValue
+129 TAUEVENT 0 "ONESIDED_MESSAGE_RECIPROCAL_SEND" TriggerValue
+128 TAUEVENT 0 "ONESIDED_MESSAGE_RECV" TriggerValue
+127 TAUEVENT 0 "ONESIDED_MESSAGE_SEND" TriggerValue
+61 TAUEVENT 0 "OS Machine | x86_64" TriggerValue
+62 TAUEVENT 0 "OS Name | Linux" TriggerValue
+63 TAUEVENT 0 "OS Release | 5.4.286-1.el8.elrepo.x86_64" TriggerValue
+64 TAUEVENT 0 "OS Version | #1 SMP Sun Nov 17 11:28:26 EST 2024" TriggerValue
+65 TAUEVENT 0 "Starting Timestamp | 1747726794996778" TriggerValue
+66 TAUEVENT 0 "TAU Architecture | default" TriggerValue
+67 TAUEVENT 0 "TAU Config |  -opencl=/opt/sw/arch/easybuild/2023b/software/CUDA/12.1.1/targets/x86_64-linux/" TriggerValue
+68 TAUEVENT 0 "TAU Makefile | /opt/sw/noarch/manual/2023b/softwares/TAU2/tau2/x86_64/lib/Makefile.tau" TriggerValue
+69 TAUEVENT 0 "TAU Version | 2.34-git" TriggerValue
+70 TAUEVENT 0 "TAU_BFD_LOOKUP | on" TriggerValue
+71 TAUEVENT 0 "TAU_CALLPATH_DEPTH | 2" TriggerValue
+72 TAUEVENT 0 "TAU_CALLSITE_DEPTH | 1" TriggerValue
+73 TAUEVENT 0 "TAU_CUDA_BINARY_EXE | " TriggerValue
+74 TAUEVENT 0 "TAU_CUPTI_API | runtime" TriggerValue
+75 TAUEVENT 0 "TAU_CUPTI_PC | off" TriggerValue
+76 TAUEVENT 0 "TAU_CURRENT_TIMER_EXIT_PARAMS | off" TriggerValue
+77 TAUEVENT 0 "TAU_EBS_KEEP_UNRESOLVED_ADDR | off" TriggerValue
+78 TAUEVENT 0 "TAU_IBM_BG_HWP_COUNTERS | off" TriggerValue
+79 TAUEVENT 0 "TAU_MEASURE_TAU | off" TriggerValue
+80 TAUEVENT 0 "TAU_MEMDBG_PROTECT_ABOVE | off" TriggerValue
+81 TAUEVENT 0 "TAU_MEMDBG_PROTECT_BELOW | off" TriggerValue
+82 TAUEVENT 0 "TAU_MEMDBG_PROTECT_FREE | off" TriggerValue
+83 TAUEVENT 0 "TAU_MEMMGR_MAX_BLOCKS | 64" TriggerValue
+84 TAUEVENT 0 "TAU_OPENMP_RUNTIME | on" TriggerValue
+85 TAUEVENT 0 "TAU_OPENMP_RUNTIME_EVENTS | on" TriggerValue
+86 TAUEVENT 0 "TAU_OPENMP_RUNTIME_STATES | off" TriggerValue
+87 TAUEVENT 0 "TAU_OUTPUT_CUDA_CSV | off" TriggerValue
+88 TAUEVENT 0 "TAU_PAPI_MULTIPLEXING | off" TriggerValue
+89 TAUEVENT 0 "TAU_PROFILE | off" TriggerValue
+90 TAUEVENT 0 "TAU_PROFILE_FORMAT | profile" TriggerValue
+91 TAUEVENT 0 "TAU_RECYCLE_THREADS | off" TriggerValue
+92 TAUEVENT 0 "TAU_REGION_ADDRESSES | off" TriggerValue
+93 TAUEVENT 0 "TAU_SAMPLING | off" TriggerValue
+94 TAUEVENT 0 "TAU_SHOW_MEMORY_FUNCTIONS | off" TriggerValue
+95 TAUEVENT 0 "TAU_SIGNALS_GDB | off" TriggerValue
+96 TAUEVENT 0 "TAU_SYNCHRONIZE_CLOCKS | off" TriggerValue
+97 TAUEVENT 0 "TAU_THROTTLE | on" TriggerValue
+98 TAUEVENT 0 "TAU_THROTTLE_NUMCALLS | 100000" TriggerValue
+99 TAUEVENT 0 "TAU_THROTTLE_PERCALL | 10" TriggerValue
+100 TAUEVENT 0 "TAU_TRACE | on" TriggerValue
+101 TAUEVENT 0 "TAU_TRACE_FORMAT | tau" TriggerValue
+102 TAUEVENT 0 "TAU_TRACK_CUDA_CDP | off" TriggerValue
+103 TAUEVENT 0 "TAU_TRACK_CUDA_ENV | off" TriggerValue
+104 TAUEVENT 0 "TAU_TRACK_CUDA_INSTRUCTIONS | " TriggerValue
+105 TAUEVENT 0 "TAU_TRACK_CUDA_SASS | off" TriggerValue
+106 TAUEVENT 0 "TAU_TRACK_HEADROOM | off" TriggerValue
+107 TAUEVENT 0 "TAU_TRACK_HEAP | off" TriggerValue
+108 TAUEVENT 0 "TAU_TRACK_IO_PARAMS | off" TriggerValue
+109 TAUEVENT 0 "TAU_TRACK_MEMORY_FOOTPRINT | off" TriggerValue
+110 TAUEVENT 0 "TAU_TRACK_MEMORY_LEAKS | off" TriggerValue
+111 TAUEVENT 0 "TAU_TRACK_SIGNALS | off" TriggerValue
+112 TAUEVENT 0 "TAU_TRACK_UNIFIED_MEMORY | off" TriggerValue
+113 TAUEVENT 0 "TAU_VERBOSE_RANK | -1" TriggerValue
+42 TAUEVENT 0 "Time in Queue (us)" TriggerValue
+43 TAUEVENT 0 "Time in Submitted (us)" TriggerValue
+114 TAUEVENT 0 "Timestamp | 1747726794996859" TriggerValue
+115 TAUEVENT 0 "UTC Time | 2025-05-20T07:39:54Z" TriggerValue
+37 TAUEVENT 0 "Unified Memory Bytes copied from Device to Host" TriggerValue
+36 TAUEVENT 0 "Unified Memory Bytes copied from Host to Device" TriggerValue
+38 TAUEVENT 0 "Unified Memory CPU Page Faults" TriggerValue
+123 TRACER 0 "WALL_CLOCK" none
+22 TAU_USER 0 "WriteBuffer " EntryExit
+9 TAU_USER 0 "cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *) C " EntryExit
+8 TAU_USER 0 "cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *) C " EntryExit
+13 TAU_USER 0 "cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *) C " EntryExit
+26 TAU_USER 0 "cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) C " EntryExit
+20 TAU_USER 0 "cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) C " EntryExit
+29 TAU_USER 0 "cl_int clFinish(cl_command_queue) C " EntryExit
+21 TAU_USER 0 "cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) C " EntryExit
+6 TAU_USER 0 "cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) C " EntryExit
+7 TAU_USER 0 "cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *) C " EntryExit
+27 TAU_USER 0 "cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *) C " EntryExit
+4 TAU_USER 0 "cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *) C " EntryExit
+5 TAU_USER 0 "cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *) C " EntryExit
+15 TAU_USER 0 "cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) C " EntryExit
+14 TAU_USER 0 "cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *) C " EntryExit
+23 TAU_USER 0 "cl_int clReleaseCommandQueue(cl_command_queue) C " EntryExit
+16 TAU_USER 0 "cl_int clReleaseContext(cl_context) C " EntryExit
+28 TAU_USER 0 "cl_int clReleaseKernel(cl_kernel) C " EntryExit
+18 TAU_USER 0 "cl_int clReleaseProgram(cl_program) C " EntryExit
+10 TAU_USER 0 "cl_int clRetainCommandQueue(cl_command_queue) C " EntryExit
+11 TAU_USER 0 "cl_int clRetainContext(cl_context) C " EntryExit
+24 TAU_USER 0 "cl_int clRetainKernel(cl_kernel) C " EntryExit
+25 TAU_USER 0 "cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *) C " EntryExit
+17 TAU_USER 0 "cl_kernel clCreateKernel(cl_program, const char *, cl_int *) C " EntryExit
+19 TAU_USER 0 "cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *) C " EntryExit
+12 TAU_USER 0 "cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C " EntryExit
+30 TAU_USER 0 "fill " EntryExit
+31 TAU_USER 0 "matrix_mul " EntryExit
+116 TAUEVENT 0 "pid | 2856919" TriggerValue
+3 TAU_DEFAULT 0 "taupreload_main " EntryExit
+117 TAUEVENT 0 "tid | 2856919" TriggerValue
+118 TAUEVENT 0 "username | jhano" TriggerValue
diff --git a/tau.slog2 b/tau.slog2
new file mode 100644
index 0000000000000000000000000000000000000000..a5b6e9fc9999c64042d4ed0dcc27b1de9dc07b10
Binary files /dev/null and b/tau.slog2 differ
diff --git a/tau.trc b/tau.trc
new file mode 100644
index 0000000000000000000000000000000000000000..3ad2cfa7ed53b86ec3e11a1d45b11394605ffe4b
Binary files /dev/null and b/tau.trc differ
diff --git a/tau_profile_fast.txt b/tau_profile_fast.txt
new file mode 100644
index 0000000000000000000000000000000000000000..24730fba9f83d71f530231207ee5ef831860a6f7
--- /dev/null
+++ b/tau_profile_fast.txt
@@ -0,0 +1,73 @@
+Reading Profile files in profile.*
+
+FUNCTION SUMMARY (total):
+---------------------------------------------------------------------------------------
+%Time    Exclusive    Inclusive       #Call      #Subrs  Inclusive Name
+              msec   total msec                          usec/call 
+---------------------------------------------------------------------------------------
+100.0          936        3,683           2           5    1841522 .TAU application
+ 49.7          726        1,830           1         158    1830958 taupreload_main
+ 24.6          904          904           1           0     904273 matrix_mul
+ 24.5          902          902           1           0     902003 cl_int clFinish(cl_command_queue) C
+  2.9          107          107           1           0     107413 cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *) C
+  1.8           66           66           2           0      33116 cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *) C
+  0.4           14           14           9           0       1624 cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *) C
+  0.3           11           11           2           0       5759 cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) C
+  0.3           10           10           2           0       5494 WriteBuffer
+  0.1            2            2           2           0       1276 cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) C
+  0.0        0.143        0.143           1           0        143 fill
+  0.0        0.118        0.118           9           0         13 cl_kernel clCreateKernel(cl_program, const char *, cl_int *) C
+  0.0        0.031        0.031           4           0          8 cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *) C
+  0.0        0.021        0.021           7           0          3 cl_int clReleaseCommandQueue(cl_command_queue) C
+  0.0        0.012        0.012          18           0          1 cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *) C
+  0.0         0.01         0.01           7           0          1 cl_int clRetainCommandQueue(cl_command_queue) C
+  0.0         0.01         0.01           3           0          3 cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *) C
+  0.0        0.008        0.008           9           0          1 cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C
+  0.0        0.007        0.007           1           0          7 cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *) C
+  0.0        0.007        0.007          18           0          0 cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) C
+  0.0        0.007        0.007          17           0          0 cl_int clRetainContext(cl_context) C
+  0.0        0.005        0.005          18           0          0 cl_int clReleaseContext(cl_context) C
+  0.0        0.005        0.005           9           0          1 cl_int clReleaseProgram(cl_program) C
+  0.0        0.002        0.002           2           0          1 cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *) C
+  0.0        0.002        0.002           2           0          1 cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *) C
+  0.0        0.002        0.002          11           0          0 cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *) C
+  0.0        0.001        0.001           2           0          0 cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) C
+  0.0        0.001        0.001           2           0          0 cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) C
+  0.0        0.001        0.001           1           0          1 cl_int clReleaseKernel(cl_kernel) C
+  0.0        0.001        0.001           1           0          1 cl_int clRetainKernel(cl_kernel) C
+
+FUNCTION SUMMARY (mean):
+---------------------------------------------------------------------------------------
+%Time    Exclusive    Inclusive       #Call      #Subrs  Inclusive Name
+              msec   total msec                          usec/call 
+---------------------------------------------------------------------------------------
+100.0          468        1,841           1         2.5    1841522 .TAU application
+ 49.7          363          915         0.5          79    1830958 taupreload_main
+ 24.6          452          452         0.5           0     904273 matrix_mul
+ 24.5          451          451         0.5           0     902003 cl_int clFinish(cl_command_queue) C
+  2.9           53           53         0.5           0     107413 cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *) C
+  1.8           33           33           1           0      33116 cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *) C
+  0.4            7            7         4.5           0       1624 cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *) C
+  0.3            5            5           1           0       5759 cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) C
+  0.3            5            5           1           0       5494 WriteBuffer
+  0.1            1            1           1           0       1276 cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) C
+  0.0       0.0715       0.0715         0.5           0        143 fill
+  0.0        0.059        0.059         4.5           0         13 cl_kernel clCreateKernel(cl_program, const char *, cl_int *) C
+  0.0       0.0155       0.0155           2           0          8 cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *) C
+  0.0       0.0105       0.0105         3.5           0          3 cl_int clReleaseCommandQueue(cl_command_queue) C
+  0.0        0.006        0.006           9           0          1 cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *) C
+  0.0        0.005        0.005         3.5           0          1 cl_int clRetainCommandQueue(cl_command_queue) C
+  0.0        0.005        0.005         1.5           0          3 cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *) C
+  0.0        0.004        0.004         4.5           0          1 cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C
+  0.0       0.0035       0.0035         0.5           0          7 cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *) C
+  0.0       0.0035       0.0035           9           0          0 cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) C
+  0.0       0.0035       0.0035         8.5           0          0 cl_int clRetainContext(cl_context) C
+  0.0       0.0025       0.0025           9           0          0 cl_int clReleaseContext(cl_context) C
+  0.0       0.0025       0.0025         4.5           0          1 cl_int clReleaseProgram(cl_program) C
+  0.0        0.001        0.001           1           0          1 cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *) C
+  0.0        0.001        0.001           1           0          1 cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *) C
+  0.0        0.001        0.001         5.5           0          0 cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *) C
+  0.0       0.0005       0.0005           1           0          0 cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) C
+  0.0       0.0005       0.0005           1           0          0 cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) C
+  0.0       0.0005       0.0005         0.5           0          1 cl_int clReleaseKernel(cl_kernel) C
+  0.0       0.0005       0.0005         0.5           0          1 cl_int clRetainKernel(cl_kernel) C
diff --git a/tau_profile_naive.txt b/tau_profile_naive.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9da6b28a87745e385321ff11e01e53a6dd0fc099
--- /dev/null
+++ b/tau_profile_naive.txt
@@ -0,0 +1,73 @@
+Reading Profile files in profile.*
+
+FUNCTION SUMMARY (total):
+---------------------------------------------------------------------------------------
+%Time    Exclusive    Inclusive       #Call      #Subrs  Inclusive Name
+              msec   total msec                          usec/call 
+---------------------------------------------------------------------------------------
+100.0        1,005       24,133           2           5   12066881 .TAU application
+ 50.0          725       12,055           1         157   12055839 taupreload_main
+ 45.8       11,061       11,061           1           0   11061448 cl_int clFinish(cl_command_queue) C
+ 45.8       11,061       11,061           1           0   11061306 matrix_mul
+  0.5          126          126           2           0      63131 cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *) C
+  0.5          115          115           1           0     115365 cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *) C
+  0.1           15           15           9           0       1723 cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *) C
+  0.0           11           11           2           0       5755 cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) C
+  0.0           11           11           2           0       5503 WriteBuffer
+  0.0         0.19         0.19           2           0         95 cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) C
+  0.0        0.143        0.143           1           0        143 fill
+  0.0        0.123        0.123           9           0         14 cl_kernel clCreateKernel(cl_program, const char *, cl_int *) C
+  0.0        0.028        0.028           4           0          7 cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *) C
+  0.0        0.024        0.024           7           0          3 cl_int clReleaseCommandQueue(cl_command_queue) C
+  0.0         0.01         0.01          18           0          1 cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *) C
+  0.0        0.009        0.009           3           0          3 cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *) C
+  0.0        0.008        0.008          17           0          0 cl_int clRetainContext(cl_context) C
+  0.0        0.006        0.006           1           0          6 cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *) C
+  0.0        0.006        0.006          18           0          0 cl_int clReleaseContext(cl_context) C
+  0.0        0.006        0.006           9           0          1 cl_int clReleaseProgram(cl_program) C
+  0.0        0.006        0.006           7           0          1 cl_int clRetainCommandQueue(cl_command_queue) C
+  0.0        0.006        0.006           9           0          1 cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C
+  0.0        0.003        0.003          10           0          0 cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *) C
+  0.0        0.002        0.002           2           0          1 cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) C
+  0.0        0.001        0.001           2           0          0 cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) C
+  0.0        0.001        0.001           2           0          0 cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *) C
+  0.0        0.001        0.001           2           0          0 cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *) C
+  0.0        0.001        0.001          18           0          0 cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) C
+  0.0        0.001        0.001           1           0          1 cl_int clReleaseKernel(cl_kernel) C
+  0.0        0.001        0.001           1           0          1 cl_int clRetainKernel(cl_kernel) C
+
+FUNCTION SUMMARY (mean):
+---------------------------------------------------------------------------------------
+%Time    Exclusive    Inclusive       #Call      #Subrs  Inclusive Name
+              msec   total msec                          usec/call 
+---------------------------------------------------------------------------------------
+100.0          502       12,066           1         2.5   12066881 .TAU application
+ 50.0          362        6,027         0.5        78.5   12055839 taupreload_main
+ 45.8        5,530        5,530         0.5           0   11061448 cl_int clFinish(cl_command_queue) C
+ 45.8        5,530        5,530         0.5           0   11061306 matrix_mul
+  0.5           63           63           1           0      63131 cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *) C
+  0.5           57           57         0.5           0     115365 cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *) C
+  0.1            7            7         4.5           0       1723 cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *) C
+  0.0            5            5           1           0       5755 cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) C
+  0.0            5            5           1           0       5503 WriteBuffer
+  0.0        0.095        0.095           1           0         95 cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) C
+  0.0       0.0716       0.0716         0.5           0        143 fill
+  0.0       0.0615       0.0615         4.5           0         14 cl_kernel clCreateKernel(cl_program, const char *, cl_int *) C
+  0.0        0.014        0.014           2           0          7 cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *) C
+  0.0        0.012        0.012         3.5           0          3 cl_int clReleaseCommandQueue(cl_command_queue) C
+  0.0        0.005        0.005           9           0          1 cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *) C
+  0.0       0.0045       0.0045         1.5           0          3 cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *) C
+  0.0        0.004        0.004         8.5           0          0 cl_int clRetainContext(cl_context) C
+  0.0        0.003        0.003         0.5           0          6 cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *) C
+  0.0        0.003        0.003           9           0          0 cl_int clReleaseContext(cl_context) C
+  0.0        0.003        0.003         4.5           0          1 cl_int clReleaseProgram(cl_program) C
+  0.0        0.003        0.003         3.5           0          1 cl_int clRetainCommandQueue(cl_command_queue) C
+  0.0        0.003        0.003         4.5           0          1 cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C
+  0.0       0.0015       0.0015           5           0          0 cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *) C
+  0.0        0.001        0.001           1           0          1 cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) C
+  0.0       0.0005       0.0005           1           0          0 cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) C
+  0.0       0.0005       0.0005           1           0          0 cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *) C
+  0.0       0.0005       0.0005           1           0          0 cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *) C
+  0.0       0.0005       0.0005           9           0          0 cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) C
+  0.0       0.0005       0.0005         0.5           0          1 cl_int clReleaseKernel(cl_kernel) C
+  0.0       0.0005       0.0005         0.5           0          1 cl_int clRetainKernel(cl_kernel) C
diff --git a/tau_summary.txt b/tau_summary.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7139307646817455a58467c71361a0de183baa4c
--- /dev/null
+++ b/tau_summary.txt
@@ -0,0 +1,77 @@
+Reading Profile files in profile.*
+
+FUNCTION SUMMARY (total):
+---------------------------------------------------------------------------------------
+%Time    Exclusive    Inclusive       #Call      #Subrs  Inclusive Name
+              msec   total msec                          usec/call 
+---------------------------------------------------------------------------------------
+100.0          272       21,994           2           7   10997020 .TAU application
+ 49.9       10,741       10,985           1         179   10985515 taupreload_main
+ 48.8       10,735       10,735           1           0   10735516 matrix_mul
+  0.5          114          114           1           0     114368 cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *) C
+  0.5          112          112           2           0      56124 cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *) C
+  0.1           13           13           9           0       1541 cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *) C
+  0.0            2            2           3           0        801 cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) C
+  0.0            1            1           2           0        508 cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) C
+  0.0        0.672        0.672           2           0        336 WriteBuffer
+  0.0        0.127        0.127           1           0        127 cl_int clEnqueueCopyBuffer(cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *) C
+  0.0        0.119        0.119           9           0         13 cl_kernel clCreateKernel(cl_program, const char *, cl_int *) C
+  0.0       0.0275       0.0275           2           0         14 fill
+  0.0       0.0243       0.0243           1           0         24 CopyBuffer
+  0.0        0.024        0.024          10           0          2 cl_int clReleaseCommandQueue(cl_command_queue) C
+  0.0        0.018        0.018           4           0          4 cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *) C
+  0.0        0.011        0.011          18           0          1 cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *) C
+  0.0        0.008        0.008           9           0          1 cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C
+  0.0        0.007        0.007           1           0          7 cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *) C
+  0.0        0.007        0.007          21           0          0 cl_int clReleaseContext(cl_context) C
+  0.0        0.007        0.007           5           0          1 cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *) C
+  0.0        0.005        0.005           9           0          1 cl_int clReleaseProgram(cl_program) C
+  0.0        0.005        0.005           9           0          1 cl_int clRetainCommandQueue(cl_command_queue) C
+  0.0        0.005        0.005          20           0          0 cl_int clRetainContext(cl_context) C
+  0.0        0.005        0.005          14           0          0 cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *) C
+  0.0        0.004        0.004          18           0          0 cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) C
+  0.0        0.003        0.003           1           0          3 cl_int clFinish(cl_command_queue) C
+  0.0        0.002        0.002           2           0          1 cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *) C
+  0.0        0.002        0.002           2           0          1 cl_int clRetainKernel(cl_kernel) C
+  0.0        0.001        0.001           2           0          0 cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) C
+  0.0        0.001        0.001           3           0          0 cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *) C
+  0.0        0.001        0.001           2           0          0 cl_int clReleaseKernel(cl_kernel) C
+  0.0            0            0           2           0          0 cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) C
+
+FUNCTION SUMMARY (mean):
+---------------------------------------------------------------------------------------
+%Time    Exclusive    Inclusive       #Call      #Subrs  Inclusive Name
+              msec   total msec                          usec/call 
+---------------------------------------------------------------------------------------
+100.0          136       10,997           1         3.5   10997020 .TAU application
+ 49.9        5,370        5,492         0.5        89.5   10985515 taupreload_main
+ 48.8        5,367        5,367         0.5           0   10735516 matrix_mul
+  0.5           57           57         0.5           0     114368 cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *) C
+  0.5           56           56           1           0      56124 cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *) C
+  0.1            6            6         4.5           0       1541 cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *) C
+  0.0            1            1         1.5           0        801 cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) C
+  0.0        0.507        0.507           1           0        508 cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) C
+  0.0        0.336        0.336           1           0        336 WriteBuffer
+  0.0       0.0635       0.0635         0.5           0        127 cl_int clEnqueueCopyBuffer(cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *) C
+  0.0       0.0595       0.0595         4.5           0         13 cl_kernel clCreateKernel(cl_program, const char *, cl_int *) C
+  0.0       0.0138       0.0138           1           0         14 fill
+  0.0       0.0121       0.0121         0.5           0         24 CopyBuffer
+  0.0        0.012        0.012           5           0          2 cl_int clReleaseCommandQueue(cl_command_queue) C
+  0.0        0.009        0.009           2           0          4 cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *) C
+  0.0       0.0055       0.0055           9           0          1 cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *) C
+  0.0        0.004        0.004         4.5           0          1 cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *) C
+  0.0       0.0035       0.0035         0.5           0          7 cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *) C
+  0.0       0.0035       0.0035        10.5           0          0 cl_int clReleaseContext(cl_context) C
+  0.0       0.0035       0.0035         2.5           0          1 cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *) C
+  0.0       0.0025       0.0025         4.5           0          1 cl_int clReleaseProgram(cl_program) C
+  0.0       0.0025       0.0025         4.5           0          1 cl_int clRetainCommandQueue(cl_command_queue) C
+  0.0       0.0025       0.0025          10           0          0 cl_int clRetainContext(cl_context) C
+  0.0       0.0025       0.0025           7           0          0 cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *) C
+  0.0        0.002        0.002           9           0          0 cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) C
+  0.0       0.0015       0.0015         0.5           0          3 cl_int clFinish(cl_command_queue) C
+  0.0        0.001        0.001           1           0          1 cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *) C
+  0.0        0.001        0.001           1           0          1 cl_int clRetainKernel(cl_kernel) C
+  0.0       0.0005       0.0005           1           0          0 cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) C
+  0.0       0.0005       0.0005         1.5           0          0 cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *) C
+  0.0       0.0005       0.0005           1           0          0 cl_int clReleaseKernel(cl_kernel) C
+  0.0            0            0           1           0          0 cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) C
diff --git a/tautrace.0.0.0.trc b/tautrace.0.0.0.trc
new file mode 100644
index 0000000000000000000000000000000000000000..86d787d9f36f5aec24fea44fbd861095204f454f
Binary files /dev/null and b/tautrace.0.0.0.trc differ
diff --git a/tautrace.0.0.1.trc b/tautrace.0.0.1.trc
new file mode 100644
index 0000000000000000000000000000000000000000..e4bd1dcb4e9142db8c1975325bb189147a5c67e3
Binary files /dev/null and b/tautrace.0.0.1.trc differ