Last active
August 6, 2024 12:00
-
-
Save jingwangsg/986191573494317b0762ccc183a7f54c to your computer and use it in GitHub Desktop.
sailconfig
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if status is-interactive
    # Commands to run in interactive sessions can go here
end

# ================== PATH / environment ==================
# set -x PATH

# SECURITY: an AWS access key pair used to be hard-coded at this point, which
# exposes it to everyone who can read this file. Rotate that key pair, then
# define the variables in a private, untracked file instead, e.g.
#     set -gx AWS_ACCESS_KEY_ID <id>
#     set -gx AWS_SECRET_ACCESS_KEY <secret>
# The file is optional: when it is missing the AWS variables stay unset.
set -l sail_secrets_file ~/.config/fish/sail_secrets.fish
if test -r $sail_secrets_file
    source $sail_secrets_file
end

set -x TZ Asia/Singapore
| # ================== colored print ================== | |
# Mount the remote home directory of both rclone remotes (sail_sg, sail_my)
# under ~/MOUNT/<remote>/ as background daemons.
# NOTE: inside fish this function takes precedence over the system `mount` command.
function mount
    for remote in sail_sg sail_my
        rclone mount $remote:/home/aiops/wangjing/ ~/MOUNT/$remote/ --daemon --daemon-wait 0
    end
end
# Print all arguments (joined by spaces) in red.
# `echo -e` also expands backslash escapes (e.g. \t) contained in the message.
function red_print
    set -l color_code 31
    echo -e "\033["$color_code"m$argv\033[0m"
end
# Print all arguments (joined by spaces) in green.
# `echo -e` also expands backslash escapes (e.g. \t) contained in the message.
function green_print
    set -l color_code 32
    echo -e "\033["$color_code"m$argv\033[0m"
end
# Print all arguments (joined by spaces) in blue.
# `echo -e` also expands backslash escapes (e.g. \t) contained in the message.
function blue_print
    set -l color_code 34
    echo -e "\033["$color_code"m$argv\033[0m"
end
| # ================== other functions ================== | |
| # https://github.com/junegunn/fzf/issues/868 implements fuzzy complete in fish | |
| # function clia | |
| # read -l line | |
| # commandline -a $line | |
| # # commandline -a | |
| # end | |
| # function fuzzy_complete | |
| # complete -C | sort -u | fzf --height 40% --multi --reverse -q (commandline -t) | cut --output-delimiter ' ' -f1 | sed s/-//g | clia | |
| # commandline -f end-of-line | |
| # end | |
| # bind -M insert \t fuzzy_complete | |
# Force-kill (SIGKILL) the current user's processes whose `ps` line
# ("PID COMMAND") matches a grep pattern.
#   knkill            -> pattern "python"
#   knkill ALL        -> same as no argument (behaviour kept from the original)
#   knkill PATTERN    -> use PATTERN
# Lines mentioning tmux, bash or fish, and the current shell's own PID, are
# never selected. Returns 1 without calling kill when nothing matches.
function knkill
    set -l filter python
    if set -q argv[1]; and test -n "$argv[1]"; and test "$argv[1]" != ALL
        set filter $argv[1]
    end

    # The original tried to exclude a PID with "((string echo $PPID))" inside
    # double quotes, where fish performs no command substitution, so that
    # clause was a dead regex. Exclude this shell explicitly via $fish_pid.
    set -l pids (ps -u (whoami) --no-headers -o pid,comm= \
        | grep -v -E "^\$|^ *$fish_pid |tmux|bash|fish" \
        | grep -- $filter \
        | awk '{print $1}')

    if test (count $pids) -eq 0
        echo "knkill: no process matches '$filter'" >&2
        return 1
    end
    kill -9 $pids
end
# Shorthand for `readlink -f`: print the canonical path of every argument.
function rl
    set -l paths $argv
    readlink -f $paths
end
| # ================ sail utility functions ================ | |
# Prompt for a 1-based index into the output lines of `sailctl job list` and
# print the selected line.
# Prints a hint and returns 1 when the input is not a valid index.
function select_jobs
    # One list element per output line of `sailctl job list`
    set -l job_ids (sailctl job list)
    set -l total (count $job_ids)

    echo "Enter the index of the job you want to select:"
    read -l index

    # Check that the input is a plain number first: `test -ge` reports an
    # error (instead of evaluating to false) on empty or non-numeric input.
    if string match -qr '^[0-9]+$' -- "$index"; and test $index -ge 1 -a $index -le $total
        echo $job_ids[$index]
    else
        echo "Invalid index. Please enter a number between 1 and $total."
        return 1
    end
end
# Download the SAIL quickstart script and pipe it to bash with
# --project generative-model --cluster tmkv-1.
function scto_my
    set -l quickstart_url https://download.sail.insea.io/quickstart.sh
    curl -SsL $quickstart_url | bash -s -- --project generative-model --cluster tmkv-1
end
# Download the SAIL quickstart script and pipe it to bash with
# --project generative-model --cluster sail-im-1.
function scto_sg
    set -l quickstart_url https://download.sail.insea.io/quickstart.sh
    curl -SsL $quickstart_url | bash -s -- --project generative-model --cluster sail-im-1
end
# Port-forward to a pod.
#   scfwd                          -> pick the pod interactively (scfwd_auto)
#   scfwd POD [PORT [REMOTE_PORT]] -> forward directly (_scfwd)
function scfwd
    # "$argv" must be quoted: an unquoted `test -z $argv` fails with a syntax
    # error as soon as more than one argument is passed.
    if test -z "$argv"
        scfwd_auto
    else
        _scfwd $argv
    end
end
# Usage: _scfwd POD [LOCAL_PORT [REMOTE_PORT]]
# Runs `kubectl port-forward pod/POD LOCAL_PORT:REMOTE_PORT`.
# Defaults are 2222 -> 22; the optional ports are only honoured when exactly
# two or exactly three arguments are given.
function _scfwd
    set -l target_pod $argv[1]
    set -l local_port 2222
    set -l pod_port 22
    set -l argc (count $argv)

    if test $argc -eq 2
        set local_port $argv[2]
    else if test $argc -eq 3
        set local_port $argv[2]
        set pod_port $argv[3]
    end

    green_print "Forwarding port $local_port to pod $target_pod ..."
    kubectl port-forward pod/$target_pod $local_port:$pod_port
end
# Choose one of the pods listed by `knp` with fzf and forward the first free
# local port (see get_avail_port) to port 22 of that pod.
# Returns 1 when no pod was selected.
function scfwd_auto
    set -l port (get_avail_port)
    set -l remote_port 22
    set -l pod_name (knp | fzf | awk '{print $1}')

    # Without this guard an empty selection disappears from the argument list
    # and the port number would be taken for the pod name.
    if test -z "$pod_name"
        red_print "No pod selected."
        return 1
    end
    _scfwd $pod_name $port $remote_port
end
# Print every non-empty argument on its own line (empty strings are dropped).
function _delete_empty_args
    for arg in $argv
        if test -n "$arg"
            # printf instead of echo: `echo -n`, `echo -e`, ... would treat the
            # word as an option and silently drop it from the output.
            printf '%s\n' $arg
        end
    end
end
# Usage: _scssh POD [-c|--conda ENV] [COMMAND ...]
# Runs COMMAND inside POD through `kubectl exec -it`, after exporting HOME and
# PATH; with -c/--conda the command is wrapped in `conda run -n ENV`.
# When nothing but POD is given, an interactive bash is started in $HOME.
function _scssh
    set -l pod_name $argv[1]
    set -q argv[1]; and set --erase argv[1]

    set -l PATH_VAR "export HOME=/home/aiops/wangjing; export PATH=~/homebrew/bin/:~/miniconda3/bin:\$PATH "
    set -l conda_run ""
    set -l total (count $argv)

    # Collect the remote command words here instead of piping them through an
    # echo-based filter, which drops words that look like echo options
    # (e.g. the -n in `ls -n`).
    set -l words
    set -l i 1
    while test $i -le $total
        switch $argv[$i]
            case -c --conda
                set -l env_index (math $i + 1)
                set -l env_name
                set -q argv[$env_index]; and set env_name $argv[$env_index]
                set conda_run "conda run --no-capture-output -n $env_name"
                # the environment name is consumed together with the flag
                set i $env_index
            case ''
                # empty words are dropped
            case '*'
                set words $words $argv[$i]
        end
        set i (math $i + 1)
    end

    set -l cmd
    if test $total -gt 0
        set -l remote_cmd "$conda_run $words"
        green_print "ENV: $PATH_VAR"
        green_print "Execute on $pod_name: $remote_cmd"
        set cmd bash -c "$PATH_VAR; $remote_cmd"
    else
        set cmd bash -c "$PATH_VAR; cd \$HOME; exec /bin/bash"
        green_print "Connecting to $pod_name ..."
    end
    kubectl exec -it $pod_name -- $cmd
end
# Choose one of the pods listed by `knp` with fzf and open a shell in it.
# Returns 1 when no pod was selected.
function scssh_auto
    set -l pod_name (knp | fzf | awk '{print $1}')
    # An empty selection would otherwise reach kubectl as a missing pod name.
    if test -z "$pod_name"
        red_print "No pod selected."
        return 1
    end
    _scssh $pod_name
end
# Open a shell / run a command in a pod.
#   scssh                 -> pick the pod interactively (scssh_auto)
#   scssh POD [ARGS ...]  -> hand everything to _scssh
function scssh
    if test -n "$argv"
        _scssh $argv
    else
        scssh_auto
    end
end
# Create a job with scjc (its output is mirrored to the terminal), look up the
# pod belonging to that job and start the tunnel init script in it.
# The last output line of scjc is taken as the job name.
function scjc_tunnel
    set -l job_name (scjc $argv | tee /dev/tty | tail -n 1)
    set -l pod_name (scpod $job_name | awk '{print $1}')
    green_print "Pod created: $pod_name"
    # scssh_tunnel interprets a single argument as the tunnel name and then
    # asks for a pod interactively, so the default tunnel name "debug" has to
    # be passed explicitly to target the pod that was just created.
    scssh_tunnel $pod_name debug
end
# Usage: _scssh_tunnel POD TUNNEL_NAME
# Runs the sail_util init.sh script with TUNNEL_NAME inside POD via scssh.
function _scssh_tunnel
    set -l target_pod $argv[1]
    set -l name $argv[2]
    set -l init_script /home/aiops/wangjing/WORKSPACE/sail_util/scripts/init.sh
    scssh $target_pod "bash $init_script $name"
end
# Usage: scssh_tunnel_auto TUNNEL_NAME
# Choose one of the pods listed by `knp` with fzf, then start the tunnel there.
function scssh_tunnel_auto
    set -l chosen_pod (knp | fzf | awk '{print $1}')
    set -l name $argv[1]
    _scssh_tunnel $chosen_pod $name
end
# Start the tunnel init script in a pod.
#   scssh_tunnel                 -> pick the pod interactively, tunnel name "debug"
#   scssh_tunnel NAME            -> pick the pod interactively, tunnel name NAME
#   scssh_tunnel POD NAME [...]  -> pass all arguments straight to _scssh_tunnel
function scssh_tunnel
    set -l argc (count $argv)
    if test $argc -ge 2
        _scssh_tunnel $argv
        return
    end

    set -l name debug
    if test $argc -eq 1
        set name $argv[1]
    end
    scssh_tunnel_auto $name
end
# Shorthand: `sc ARGS...` runs `sailctl ARGS...`.
function sc
    set -l passthrough $argv
    sailctl $passthrough
end
# Shorthand: `scj ARGS...` runs `sailctl job ARGS...`.
function scj
    set -l passthrough $argv
    sailctl job $passthrough
end
# Parse the command line of scjc.
#
# Usage: _scjc_parse_args [NAME] [options] [other words ...]
#   NAME              job name prefix; only taken when the first word is not a flag
#   -g, --gpu N       (default 0)
#   -r, --replica N   number of nodes (default 1)
#   -p, --priority P  (default low)
#   -t, --time H      (default 16)
#   --config NAME     (default empty)
#   -a, --args W...   every following word up to the next word starting with "-"
#   -h, --highvram, --tmux, -e, --email, -w, --wait, --mount-s3   boolean switches (0/1)
#
# Output, line 1 (fields joined by "<SEP>"):
#   name gpu nodes priority time high_vram email config tmux is_wait mount_s3 args
# Output, line 2: all words that were not consumed, joined by spaces.
function _scjc_parse_args
    set -l name gpu
    set -l gpu 0
    set -l nodes 1
    set -l priority low
    set -l time 16
    set -l high_vram 0
    set -l email 0
    set -l tmux 0
    set -l is_wait 0
    set -l mount_s3 0
    set -l config ""
    set -l args ""
    set -l other_argv

    # If the first argument does not start with "-", it is the name of the job.
    # The `set -q` guard matters: without any argument the original assigned an
    # empty list to `name`, which made the whole output line expand to nothing.
    if set -q argv[1]; and not string match -q -- "-*" "$argv[1]"
        set name $argv[1]
        set --erase argv[1]
    end

    set -l num_argv (count $argv)
    set -l i 1
    while test $i -le $num_argv
        set -l word $argv[$i]
        set -l next (math $i + 1)
        switch $word
            # Options with a value: the value is consumed together with the
            # flag; a flag at the very end keeps the default.
            case -g --gpu
                set -q argv[$next]; and set gpu $argv[$next]
                set i $next
            case -r --replica
                set -q argv[$next]; and set nodes $argv[$next]
                set i $next
            case -p --priority
                set -q argv[$next]; and set priority $argv[$next]
                set i $next
            case -t --time
                set -q argv[$next]; and set time $argv[$next]
                set i $next
            case --config
                set -q argv[$next]; and set config $argv[$next]
                set i $next
            case -a --args
                # read until next flag
                set args ""
                set i $next
                while test $i -le $num_argv
                    if string match -q -- "-*" "$argv[$i]"
                        break
                    end
                    set args "$args $argv[$i]"
                    set i (math $i + 1)
                end
                # i already points at the first word that was not consumed
                continue
            case -h --highvram
                set high_vram 1
            case --tmux
                set tmux 1
            case -e --email
                set email 1
            case -w --wait
                set is_wait 1
            case --mount-s3
                set mount_s3 1
            case ''
                # empty words are dropped
            case '*'
                set other_argv $other_argv $word
        end
        set i (math $i + 1)
    end

    # No trailing "<SEP>": it produced an extra empty field, so the unconsumed
    # words ended up one position behind where scjc reads them.
    # `string join` / printf are used instead of echo so that neither an empty
    # value nor a word such as "-n" can swallow the line.
    string join "<SEP>" -- "$name" "$gpu" "$nodes" "$priority" "$time" "$high_vram" "$email" "$config" "$tmux" "$is_wait" "$mount_s3" "$args"
    printf '%s\n' "$other_argv"
end
# Create a SAIL job by driving the interactive prompts of
# `sailctl job create` with a generated `expect` script.
#
# Usage: scjc [NAME] [options]   (options are documented on _scjc_parse_args)
#
# The job is named "<NAME>g<gpu>r<nodes>" plus "hv" for high-VRAM jobs. More
# than one node forces priority "high" and a fixed golden image.
# The commands typed into each prompt export HOME/PATH, source ~/.bashrc, cd
# to $HOME, optionally run the e-mail notifier, and finally run the --args
# command (default "echo Hi"; with --tmux it runs inside a detached tmux
# session that is kept alive for --time hours).
#
# The job name reported by sailctl is printed as the LAST line of output;
# scjc_tunnel and scjc_fwd read it with `tail -n 1`.
function scjc
    set -l context (string split " " (_get_context))
    set -l cluster $context[2]

    # get args: one "<SEP>"-joined record, see _scjc_parse_args
    set -l outputs (string split "<SEP>" (_scjc_parse_args $argv))
    set -l name $outputs[1]
    set -l gpu $outputs[2]
    set -l nodes $outputs[3]
    set -l priority $outputs[4]
    set -l time $outputs[5]
    set -l high_vram $outputs[6]
    set -l email $outputs[7]
    set -l config $outputs[8]
    set -l tmux $outputs[9]
    set -l is_wait $outputs[10] # only referenced by the deprecated wait loop at the end
    set -l mount_s3 $outputs[11]
    set -l args $outputs[12]

    if test $nodes -gt 1
        red_print "Creating a job with $nodes nodes leads to high priority!"
        set priority high
    end

    set -l job_name $name"g"$gpu"r"$nodes
    if test $high_vram -eq 1
        set job_name $job_name"hv"
    end

    green_print "name\t\t$name"
    green_print "gpu\t\t$gpu"
    green_print "nodes\t\t$nodes"
    green_print "priority\t$priority"
    green_print "job_name\t$job_name"
    green_print "high_vram\t$high_vram"
    green_print "time\t\t$time"
    green_print "email\t\t$email"
    green_print "mount_s3\t$mount_s3"
    green_print "config\t\t$config"
    green_print "tmux\t\t$tmux"

    # warning for high priority jobs
    if test "$priority" = high
        red_print "[Warning] Creating a job with high priority!"
    end

    # Optional flags for `sailctl job create`; only non-default settings are added.
    set -l sailctl_kwargs ""
    if test $gpu -gt 0
        set sailctl_kwargs "$sailctl_kwargs -g $gpu"
    end
    if test $high_vram -eq 1
        set sailctl_kwargs "$sailctl_kwargs --high-vram"
    end
    if test -n "$config"
        set -l config_dir "$HOME/WORKSPACE/sail_util/configs/sailctl"
        set sailctl_kwargs "$sailctl_kwargs -f $config_dir/$config.yaml"
    end
    if test $mount_s3 -eq 1
        set sailctl_kwargs "$sailctl_kwargs --mount-s3"
    end
    if test $nodes -gt 1
        set sailctl_kwargs "$sailctl_kwargs --image asia-docker.pkg.dev/sail-tpu-02/images/common/golden-image:12.3"
    end

    # Declared up front so that a variable of the same name in an outer scope
    # can never leak into the script; empty means "send an empty line".
    set -l email_command ""
    if test $email -eq 1
        set email_command "~/miniconda3/bin/email -m \'Job $job_name ($cluster) is running\'"
    end

    # Command executed inside the job.
    set -l sleep_seconds (math $time x 3600)
    if test -z "$args"
        set args "echo Hi"
    end
    set -l run_cmd "$args"
    if test $tmux -eq 1
        set run_cmd "tmux new-session -d -s main \\\"source ~/.bashrc; $args; sleep $sleep_seconds\\\" ;sleep $sleep_seconds"
    end

    # Lines typed into every sailctl prompt, terminated by Ctrl+D (\004).
    # Built once so that all prompts get identically quoted `send` commands:
    # the multi-node branch used to send $email_command unquoted, which Tcl
    # splits into several words.
    set -l typed_lines "
    send \"export HOME=/home/aiops/wangjing/\\r\"
    send \"export PATH=~/homebrew/bin/:~/miniconda3/bin:\\\$PATH\\r\"
    send \"source \\\$HOME/.bashrc\\r\"
    send \"cd \\\$HOME\\r\"
    send \"$email_command\\r\"
    send \"$run_cmd\\r\"
    send \"\004\"
"

    # The expect script is assembled as ONE single-quoted shell word and run
    # through eval below.
    set -l cmd "expect -c '
    spawn sailctl job create $job_name --debug -r $nodes -p $priority $sailctl_kwargs --args
"
    if test $nodes -eq 1
        set cmd "$cmd
    expect \"Please enter your arguments (multi-line format, press Ctrl+D to finish):\"
$typed_lines"
    else
        set cmd "$cmd
    expect \"Please enter your master command (multi-line format, press Ctrl+D to finish):\"
$typed_lines
    expect \"Please enter your worker command (multi-line format, press Ctrl+D to finish):\"
$typed_lines"
    end
    set cmd "$cmd
    interact
'"

    # Mirror everything to the terminal but keep only the last line.
    set -l create_output (eval $cmd | tee /dev/tty | tail -n 1)
    if test -z "$create_output"
        red_print "sailctl produced no output, job name unknown."
        return 1
    end

    # Keep the text after the final "/" of that line; a \r will be appended to
    # the job_name, hence the trim.
    set job_name (string trim -- (string split "/" $create_output)[-1])
    echo $job_name

    # ! Deprecated
    # blue_print "Waiting for pod to be Running..."
    # set schedule_info_printed false
    # while true
    #     set pod_status (knp | grep $job_name | awk '{print $6}')
    #     # when length larger then 0
    #     if test (count $pod_status) -gt 0
    #         # pod is created
    #         set pod_counts (count $pod_status)
    #         set pod_status (string split "\n" $pod_status | head -n 1)
    #     else
    #         if test $schedule_info_printed = false
    #             set schedule_info (curl https://scheduler.$cluster.insea.io/scheduler_info.txt -s)
    #             set cur_schedule_info (printf "%s\n" $schedule_info | grep $job_name | grep "priority_class" | awk '{print $4,$5,$17,$18,$19}')
    #             if [ "$cur_schedule_info" != "" ]
    #                 red_print $cur_schedule_info
    #                 set schedule_info_printed true
    #                 if test $is_wait -eq 0
    #                     # query only once
    #                     break
    #                 end
    #             end
    #         end
    #         sleep 5
    #         continue
    #     end
    #     # master node is running and pod_counts is equal to nodes
    #     if test $pod_status = Running; and test $pod_counts -eq $nodes
    #         set priority (knp | grep $job_name | awk '{print $4}' | head -n 1)
    #         if test $priority = high
    #             red_print "[Warning] High priority job is running!"
    #         end
    #         green_print "Ready!"
    #         echo $job_name
    #         break
    #     else
    #         sleep 3
    #     end
    # end
end
# `aws s3` against the custom S3 endpoint; all arguments are passed through.
function aws3
    set -l endpoint https://pub2.s3g.data-infra.shopee.io
    aws s3 --endpoint-url $endpoint $argv
end
# Print the first port >= 2222 on localhost for which `nc -z` fails, i.e.
# where no connection could be established.
function get_avail_port
    set -l candidate 2222
    # keep counting up while something answers on the candidate port
    while nc -z localhost $candidate >/dev/null 2>&1
        set candidate (math $candidate + 1)
    end
    echo $candidate
end
# Create a job with scjc (its output is mirrored to the terminal), look up the
# pod of that job and forward a free local port to port 22 of the pod.
# The last output line of scjc is taken as the job name.
function scjc_fwd
    set -l created_job (scjc $argv | tee /dev/tty | tail -n 1)
    set -l created_pod (scpod $created_job | awk '{print $1}')
    green_print "Pod created: $created_pod"
    set -l local_port (get_avail_port)
    scfwd $created_pod $local_port 22
end
# List pods as a table: NAME OWNER GPU READY PRIORITY STATUS IP, plus the 5th
# column of the default `kubectl get pods` listing appended to every row.
# Extra arguments (e.g. --field-selector=...) are passed to both kubectl calls.
function scpods_all
    set -l columns 'custom-columns=NAME:.metadata.name,OWNER:.metadata.labels.owner,GPU:.spec.containers[0].resources.requests.nvidia\.com/gpu,READY:.status.conditions[?(@.type=="Ready")].status,PRIORITY:.spec.priorityClassName,STATUS:.status.phase,IP:.status.podIP'
    set -l details_file (mktemp)
    set -l extra_file (mktemp)

    # Both listings must use the same order, because `paste` joins them line
    # by line. The original sorted only the second one, so the appended column
    # belonged to a different pod unless the caller passed --sort-by itself.
    kubectl get pods --sort-by=.metadata.creationTimestamp -o $columns $argv >$details_file
    # NOTE(review): column 5 is presumably AGE in the default output — confirm.
    kubectl get pods --sort-by=.metadata.creationTimestamp $argv | awk '{print $5}' >$extra_file

    paste $details_file $extra_file
    rm $details_file $extra_file
end
# scpods_all restricted to pods in phase Running, sorted by creation time.
function scpods_running
    set -l running_only --sort-by=.metadata.creationTimestamp --field-selector=status.phase=Running
    scpods_all $running_only
end
# Show the running pods on the terminal and print the sum of their GPU column.
function scpods
    # for unknown reason, the output of scpods cannot be used for fzf,
    # thus most functions ending in _auto would fail with it
    set -l rows (scpods_running | tee /dev/tty)
    # column 3 is GPU; the header row and pods without a GPU request ("<none>") are skipped
    set -l gpu_total (string split "\n" -- $rows | awk '$3 != "<none>" && $3 != "GPU" {sum += $3} END {print sum}')
    green_print "Total GPUs: $gpu_total"
end
# Usage: _scpod_args POD
# Print the `args` of the first container of POD (kubectl jsonpath query).
function _scpod_args
    set -l target_pod $argv[1]
    set -l query '{.spec.containers[0].args}'
    kubectl get pod $target_pod -o jsonpath=$query
end
# Choose a running pod with fzf (header row removed), print the chosen row and
# then the container args of that pod.
function scpod_args_auto
    set -l chosen_row (scpods_running | tail -n +2 | fzf )
    set -l chosen_pod (echo $chosen_row | awk '{print $1}')
    echo $chosen_row
    _scpod_args $chosen_pod
end
# Print the container args of a pod.
#   scpod_args       -> pick the pod interactively (scpod_args_auto)
#   scpod_args POD   -> query POD directly
function scpod_args
    if test -n "$argv"
        _scpod_args $argv
    else
        scpod_args_auto
    end
end
# Running pods whose row contains "wangjing".
function knp
    set -l owner_pattern wangjing
    scpods_running | grep $owner_pattern
end
# Pods in any phase whose row contains "wangjing".
function knp_all
    set -l owner_pattern wangjing
    scpods_all | grep $owner_pattern
end
# Shorthand for `sailctl get quota` (takes no arguments).
function scq
    set -l subcommand get quota
    sailctl $subcommand
end
# Usage: scpod JOB
# List (without header) the pods carrying the label job-name=JOB.
function scpod
    set -l job_label job-name=$argv[1]
    kubectl get pods -l $job_label --no-headers
end
# Print "<platform> <cluster>" for the current kubectl context:
#   generative-model@sail-im-1 -> "argo sail"
#   anything else              -> "margo tmkv-1"
function _get_context
    set -l current (kubectl config current-context)
    # Quoted on purpose: when kubectl prints nothing, an unquoted variable
    # expands to no argument at all and `test` fails with a syntax error.
    if test "$current" = "generative-model@sail-im-1"
        echo argo sail
    else
        echo margo tmkv-1
    end
end
# For every line of `sailctl job list` print "<job> <web URL> <status>":
#   - green: columns 1, 6 and 8 of the first matching running-pod row
#   - red:   selected fields of the job's "priority_class" line in the
#            scheduler info file of the current cluster, when no pod is running
function scl
    set -l ctx (string split " " (_get_context))
    set -l platform $ctx[1]
    set -l cluster $ctx[2]

    set -l job_list (sailctl job list)
    set -l running_rows (scpods_running)
    set -l scheduler_dump (curl https://scheduler.$cluster.insea.io/scheduler_info.txt -s)

    for job in $job_list
        set -l job_url https://$platform.sail.insea.io/applications/$job
        set -l pod_info (string split "\n" $running_rows | grep $job | awk '{print $1,$6,$8}' | head -n 1)
        if test -n "$pod_info"
            echo $job $job_url (green_print $pod_info)
        else
            # printf puts every list element back on its own line for grep, see
            # https://stackoverflow.com/questions/34166077/multi-line-variables-remove-new-line-character-fish
            set -l pending_info (printf "%s\n" $scheduler_dump | grep $job | grep "priority_class" | awk '{print $4,$5,$17,$18,$19}')
            echo $job $job_url (red_print $pending_info)
        end
    end
end
# Usage: _sckill PATTERN
# Delete every job whose name (first column of `scl`) matches PATTERN, then
# delete the pods of that job in the background.
function _sckill
    set -l pattern $argv[1]
    # ! sailctl job delete not working so far
    # ! supposedly, sailctl should also terminate pods, but it does not
    for job in (scl | awk '{print $1}' | grep $pattern)
        red_print "sailctl job delete --force $job"
        # collect the pods before the job is deleted
        set -l pods (knp | grep $job | awk '{print $1}')
        sailctl job delete --force $job
        for pod in $pods
            # --now=true means terminate immediately
            set -l delete_cmd "kubectl delete pod $pod -n generative-model --now=true &"
            red_print $delete_cmd
            eval $delete_cmd
        end
    end
end
# Delete jobs.
#   sckill           -> pick the job interactively (sckill_auto)
#   sckill PATTERN   -> delete all jobs matching PATTERN (_sckill)
function sckill
    set -l pattern $argv[1]
    if test -n "$pattern"
        _sckill $pattern
    else
        sckill_auto
    end
end
# Choose a row of `scl` with fzf and delete the job named in its first column.
function sckill_auto
    set -l chosen_job (scl | fzf | awk '{print $1}')
    _sckill $chosen_job
end
# Delete every job printed by `sailctl job list`.
# Asks for confirmation (only "y" proceeds) unless -f / --force is given.
function sckillall
    set -l jobs (sailctl job list)

    if not contains -- -f $argv; and not contains -- --force $argv
        read -P (red_print "Do you want to proceed with killing $jobs: ") choice
        if test "$choice" != y
            echo "Aborting killing process."
            return
        end
    end

    for job in $jobs
        sckill $job
    end
end
| # >>> conda initialize >>> | |
| # !! Contents within this block are managed by 'conda init' !! Interactive shell with the conda binary present: evaluate its fish hook; otherwise source conda.fish if that file exists, else prepend miniconda3/bin to PATH. | |
| if status is-interactive; and test -f /Users/SG3736/miniconda3/bin/conda | |
| eval /Users/SG3736/miniconda3/bin/conda "shell.fish" hook $argv | source | |
| else | |
| if test -f "/Users/SG3736/miniconda3/etc/fish/conf.d/conda.fish" | |
| . "/Users/SG3736/miniconda3/etc/fish/conf.d/conda.fish" | |
| else | |
| set -x PATH /Users/SG3736/miniconda3/bin $PATH | |
| end | |
| end | |
| # <<< conda initialize <<< | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment