Skip to content

Instantly share code, notes, and snippets.

View cyb70289's full-sized avatar

Yibo Cai cyb70289

  • Arm
  • Shanghai
View GitHub Profile
@cyb70289
cyb70289 / a.txt
Last active January 6, 2026 07:08
torch-mm-accuracy
-0.808594,-1.531250,0.406250,0.171875,-0.247070,0.204102,-0.878906,-0.386719,0.601562,0.267578,-0.851562,-0.289062,1.000000,-0.781250,1.375000,0.118652,0.863281,-0.765625,0.824219,-1.187500,-0.032959,-0.080078,0.078125,-0.648438,-0.474609,-0.667969,1.054688,-0.035889,-1.320312,-0.671875,0.041504,-0.644531,0.263672,0.707031,-0.218750,2.890625,1.367188,-0.108398,0.240234,1.835938,1.750000,-0.045898,-0.351562,-0.096191,-1.765625,-0.910156,-0.597656,0.660156,0.039307,1.609375,-1.507812,1.265625,0.222656,-1.031250,0.605469,-0.470703,1.640625,-0.308594,-0.069336,-1.046875,0.621094,-0.145508,0.324219,2.390625,1.414062,-0.214844,0.349609,-0.917969,-0.322266,-0.199219,-2.171875,-0.318359,0.283203,0.302734,1.140625,-0.027832,0.644531,0.253906,1.015625,-2.671875,-1.125000,0.535156,-2.375000,-0.234375,1.507812,0.531250,-1.781250,0.228516,-1.304688,1.382812,1.656250,-0.018188,0.037109,-0.034424,-0.221680,0.835938,-0.703125,0.034180,-0.059814,1.484375,1.562500,-0.875000,0.781250,0.570312,-0.742188,-0.149414,-0.679688,-1.30
@cyb70289
cyb70289 / log
Created July 15, 2025 10:58
vllm log
$ DNNL_VERBOSE=all LD_PRELOAD="/usr/lib/`uname -m`-linux-gnu/libtcmalloc_minimal.so.4:/usr/lib/`uname -m`-linux-gnu/libgomp.so.1" OMP_NUM_THREADS=60 VLLM_CPU_OMP_THREADS_BIND="1-60" VLLM_TARGET_DEVICE=cpu VLLM_CPU_KVCACHE_SPACE=20 vllm serve RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8 --trust-remote-code --max-model-len=2048 --enforce-eager --host 0.0.0.0 --port 8000
onednn_verbose,v1,info,oneDNN v3.7.1 (commit 8d263e693366ef8db40acc569cc7d8edf644556d)
onednn_verbose,v1,info,cpu,runtime:OpenMP,nthr:60
onednn_verbose,v1,info,cpu,isa:AArch64 SVE (128 bits)
onednn_verbose,v1,info,gpu,runtime:none
onednn_verbose,v1,info,graph,backend,0:dnnl_backend
onednn_verbose,v1,primitive,info,template:operation,engine,primitive,implementation,prop_kind,memory_descriptors,attributes,auxiliary,problem_desc,exec_time
onednn_verbose,v1,graph,info,template:operation,engine,partition_id,partition_kind,op_names,data_formats,logical_tensors,fpmath_mode,implementation,backend,exec_time
INFO 07-15 10:50:56 [__init__.py:253] Au
@cyb70289
cyb70289 / llama-model-interleave.diff
Created April 2, 2025 02:04
llama model interleave
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 9a4ee49..137e3ea 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -337,6 +337,11 @@ set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of
set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
+foreach(lib "ggml" "ggml-base")
+ target_link_libraries(${lib} PUBLIC numa)
[root@lnd-jm02 iscsi-hardlock]# ./test-lse
Counters: 17171 48463 25182 21011 24232 24657 20559 15806 22230 42319 32158 16342 33570 29741 26467 19637 24702 26923
Counters: 32542 38140 21447 21856 25708 21327 13062 23763 24566 35316 36268 24727 28855 23722 34471 28493 17219 13948
Counters: 39521 29413 29365 26218 24139 10212 34962 16706 6899 30642 54875 30239 29795 22390 13964 29797 14388 21804
Counters: 39066 24223 10731 19933 41360 19116 17735 38524 24549 34395 22350 21955 28910 27429 33503 20678 24180 16701
Counters: 45116 30269 18649 32161 14482 25578 34041 29304 38479 20491 30742 29352 26589 17246 22371 23332 9264 18094
Counters: 35308 41495 16516 28869 27017 19990 22366 28377 18060 13942 29316 25233
@cyb70289
cyb70289 / gist:09438df3f78dafd6f5de7c4876daafe7
Created August 27, 2024 04:29
0002-optimize-varint-with-lookup-table.patch
From 2eee837216e575abd9e48a30c65161a42ad59117 Mon Sep 17 00:00:00 2001
From: Yibo Cai <yibo.cai@arm.com>
Date: Wed, 21 Aug 2024 06:40:48 -0400
Subject: [PATCH 2/2] optimize varint with lookup table
---
src/google/protobuf/io/coded_stream.h | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/src/google/protobuf/io/coded_stream.h b/src/google/protobuf/io/coded_stream.h
@cyb70289
cyb70289 / gist:d947c64cf0f116607140e06b535b95aa
Created August 27, 2024 03:52
0001-add-string-list-and-map-benchmarks.patch
From a5239aaf894334f4b4c331f1d40245f5f64a2cd4 Mon Sep 17 00:00:00 2001
From: Yibo Cai <yibo.cai@arm.com>
Date: Wed, 21 Aug 2024 06:40:29 -0400
Subject: [PATCH 1/2] add string list and map benchmarks
---
benchmarks/benchmark.cc | 65 +++++++++++++++++++++++++++++++++++++
benchmarks/descriptor.proto | 8 +++++
2 files changed, 73 insertions(+)
@cyb70289
cyb70289 / optimize-decode.diff
Created July 18, 2024 03:20
sonic-decode-opt
diff --git a/cmake/set_arch_flags.cmake b/cmake/set_arch_flags.cmake
index 538ddfe..81c40e4 100644
--- a/cmake/set_arch_flags.cmake
+++ b/cmake/set_arch_flags.cmake
@@ -2,8 +2,8 @@ function(set_arch_flags target arch)
message(STATUS "Setting architecture flags for ${arch}")
if(arch MATCHES "x86_64")
target_compile_options(${target} PRIVATE -mavx2 -mpclmul -mbmi -mlzcnt)
- elseif(arch MATCHES "arm")
- target_compile_options(${target} PRIVATE -march=armv8-a)
@cyb70289
cyb70289 / optimize-skipstring.diff
Created July 4, 2024 03:24
optimize skipstring with sve2 match
diff --git a/cmake/set_arch_flags.cmake b/cmake/set_arch_flags.cmake
index 538ddfe..6dc7754 100644
--- a/cmake/set_arch_flags.cmake
+++ b/cmake/set_arch_flags.cmake
@@ -2,8 +2,8 @@ function(set_arch_flags target arch)
message(STATUS "Setting architecture flags for ${arch}")
if(arch MATCHES "x86_64")
target_compile_options(${target} PRIVATE -mavx2 -mpclmul -mbmi -mlzcnt)
- elseif(arch MATCHES "arm")
- target_compile_options(${target} PRIVATE -march=armv8-a)
diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
index 5c7c6d5..dc55c69 100644
--- a/src/cpu/aarch64/jit_uni_reorder.cpp
+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
@@ -2680,6 +2680,55 @@ status_t jit_uni_reorder_t::pd_t::create(reorder_pd_t **reorder_pd,
return safe_ptr_assign(*reorder_pd, _pd.release());
}
+#define MY_REORDER
+
##################################################################
# profile.py
##################################################################
import tensorflow as tf
import timeit
import os
n_threads = int(os.getenv('OMP_NUM_THREADS'))
if n_threads < 1 or n_threads > 999: