This post records some hands-on experiments, focusing on how CPU usage is computed under CFS, with reference to the systemd-cgtop source code.
1 Test Cases
[root@ecs]# mount -t cgroup
cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd)
cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpuacct,cpu)
cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio)
cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb)
cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)
cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices)
cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer)
cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_prio,net_cls)
cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset)
cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids)
cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event)
[root@ecs cpu]# lssubsys -m
cpuset /sys/fs/cgroup/cpuset
cpu,cpuacct /sys/fs/cgroup/cpu,cpuacct
memory /sys/fs/cgroup/memory
devices /sys/fs/cgroup/devices
freezer /sys/fs/cgroup/freezer
net_cls,net_prio /sys/fs/cgroup/net_cls,net_prio
blkio /sys/fs/cgroup/blkio
perf_event /sys/fs/cgroup/perf_event
hugetlb /sys/fs/cgroup/hugetlb
pids /sys/fs/cgroup/pids
1.1 CPU limit (cpu.cfs_quota_us)
Create the group ruletest:
[root@ecs ~]# rmdir /sys/fs/cgroup/cpu/ruletest
[root@ecs ~]# mkdir /sys/fs/cgroup/cpu/ruletest
[root@ecs ~]# cd /sys/fs/cgroup/cpu/ruletest
[root@ecs /sys/fs/cgroup/cpu/ruletest]# ll
total 0
-rw-r--r-- 1 root root 0 Jun 14 14:50 cgroup.clone_children
--w--w--w- 1 root root 0 Jun 14 14:50 cgroup.event_control
-rw-r--r-- 1 root root 0 Jun 14 14:50 cgroup.procs
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.cfs_period_us
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.cfs_quota_us
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.rt_period_us
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.rt_runtime_us
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.shares
-r--r--r-- 1 root root 0 Jun 14 14:50 cpu.stat
-r--r--r-- 1 root root 0 Jun 14 14:50 cpuacct.stat
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpuacct.usage
-r--r--r-- 1 root root 0 Jun 14 14:50 cpuacct.usage_percpu
-rw-r--r-- 1 root root 0 Jun 14 14:50 notify_on_release
-rw-r--r-- 1 root root 0 Jun 14 14:50 tasks
This cgroup's CPU quota starts out unlimited (-1); set it to 20000:
[root@ecs /sys/fs/cgroup/cpu/ruletest]# cat /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us
-1
[root@ecs /sys/fs/cgroup/cpu/ruletest]# echo 20000 > /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us
[root@ecs /sys/fs/cgroup/cpu/ruletest]# cat /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us
20000
Run a program that saturates one CPU core, and watch it in top:
int main(void)
{
    int i = 0;
    for (;;) i++;   /* busy-loop forever: pins one core at 100% */
    return 0;
}
One core is pinned at 100%:
top - 15:00:14 up 19 days, 1:26, 3 users, load average: 0.44, 0.13, 0.20
Tasks: 1 total, 1 running, 0 sleeping, 0 stopped, 0 zombie
%Cpu0 : 0.3 us, 0.0 sy, 0.0 ni, 99.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu1 :100.0 us, 0.0 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu2 : 5.0 us, 2.3 sy, 0.0 ni, 92.6 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu3 : 6.0 us, 1.7 sy, 0.0 ni, 92.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu4 : 3.7 us, 1.0 sy, 0.0 ni, 95.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu5 : 0.3 us, 0.3 sy, 0.0 ni, 99.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu6 : 2.3 us, 1.3 sy, 0.0 ni, 96.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu7 : 2.0 us, 1.7 sy, 0.0 ni, 96.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem : 32779804 total, 3207136 free, 2965740 used, 26606928 buff/cache
KiB Swap: 1048572 total, 1048572 free, 0 used. 28961712 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
910 root 20 0 4208 356 280 R 99.7 0.0 0:30.71 ./deadloop
Add process 910 to the cgroup:
[root@ecs ~/tmp]# echo 910 > /sys/fs/cgroup/cpu/ruletest/tasks
[root@ecs ~/tmp]# cat /sys/fs/cgroup/cpu/ruletest/tasks
910
top then shows the process's CPU utilization drop to 20% right away. (The 20000 we wrote means 20% because cpu.cfs_period_us defaults to 100000: 20000 / 100000 = 20%.)
[root@ecs ~/tmp]# top -p 910
top - 15:06:43 up 19 days, 1:33, 3 users, load average: 0.01, 0.16, 0.22
Tasks: 1 total, 1 running, 0 sleeping, 0 stopped, 0 zombie
%Cpu0 : 4.3 us, 1.7 sy, 0.0 ni, 94.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu1 : 3.3 us, 2.0 sy, 0.0 ni, 94.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu2 : 2.0 us, 0.3 sy, 0.0 ni, 97.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu3 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu4 : 20.7 us, 0.3 sy, 0.0 ni, 79.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu5 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu6 : 6.4 us, 2.0 sy, 0.0 ni, 91.6 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu7 : 2.7 us, 1.7 sy, 0.0 ni, 95.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem : 32779804 total, 3206684 free, 2966044 used, 26607076 buff/cache
KiB Swap: 1048572 total, 1048572 free, 0 used. 28961312 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
910 root 20 0 4208 356 280 R 20.0 0.0 3:09.83 ./deadloop
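To sanity-check a cap like this, here is a minimal C sketch (the ruletest paths are just this post's example) that reads cpu.cfs_quota_us and cpu.cfs_period_us and prints quota/period as a percentage of one core; with quota 20000 and the default 100000 period it prints 20.0%.

#include <stdio.h>

/* read a single long integer from a cgroup file; -1 on failure */
static long read_long(const char *path)
{
    long v = -1;
    FILE *f = fopen(path, "r");
    if (f) {
        if (fscanf(f, "%ld", &v) != 1)
            v = -1;
        fclose(f);
    }
    return v;
}

int main(void)
{
    long quota  = read_long("/sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us");
    long period = read_long("/sys/fs/cgroup/cpu/ruletest/cpu.cfs_period_us");

    if (quota <= 0 || period <= 0) {
        printf("no CPU cap set (quota=%ld)\n", quota);
        return 0;
    }
    /* quota/period is the fraction of one core the group may consume */
    printf("cap = %.1f%% of one core\n", 100.0 * quota / period);
    return 0;
}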
1.2 Memory limit (memory.limit_in_bytes)
Write a program that keeps eating memory, 512 bytes per second:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
    int size = 0;
    int chunk_size = 512;
    void *p = NULL;

    while (1) {
        if ((p = malloc(chunk_size)) == NULL) {
            printf("out of memory!!\n");
            break;
        }
        memset(p, 1, chunk_size);   /* touch the memory so it is actually faulted in */
        size += chunk_size;
        printf("[%d] - memory is allocated [%8d] bytes \n", getpid(), size);
        sleep(1);
    }
    return 0;
}
[root@ecs ~/tmp]# ./a.out
[1236] - memory is allocated [ 512] bytes
[1236] - memory is allocated [ 1024] bytes
[1236] - memory is allocated [ 1536] bytes
[1236] - memory is allocated [ 2048] bytes
[1236] - memory is allocated [ 2560] bytes
[1236] - memory is allocated [ 3072] bytes
...
Check its memory usage:
[root@ecs ~/tmp]# ps aux | grep a.out
root 1236 0.0 0.0 4476 616 pts/3 S+ 15:11 0:00 ./a.out
[root@ecs ~]# cat /proc/1236/status | grep RSS
VmRSS: 616 kB
top - 15:17:30 up 19 days, 1:44, 4 users, load average: 0.00, 0.02, 0.11
Tasks: 1 total, 0 running, 1 sleeping, 0 stopped, 0 zombie
%Cpu(s): 2.3 us, 1.0 sy, 0.0 ni, 96.6 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem : 32779804 total, 3203592 free, 2968776 used, 26607436 buff/cache
KiB Swap: 1048572 total, 1048572 free, 0 used. 28958612 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
1236 root 20 0 4476 616 424 S 0.0 0.0 0:00.01 ./a.out
Create the ruletest group under the memory controller and limit it to 64 KB:
[root@ecs ~/tmp]# mkdir /sys/fs/cgroup/memory/ruletest
[root@ecs ~/tmp]# cat /sys/fs/cgroup/memory/ruletest/memory.limit_in_bytes
9223372036854771712
[root@ecs ~/tmp]# echo 64k > /sys/fs/cgroup/memory/ruletest/memory.limit_in_bytes
[root@ecs ~/tmp]# cat /sys/fs/cgroup/memory/ruletest/memory.limit_in_bytes
65536
Add the process to the ruletest group:
echo 1236 > /sys/fs/cgroup/memory/ruletest/tasks
The limit does not bite! The process was already using well over 64 KB before it joined the group, and pages charged before a process moves into a memory cgroup stay charged to its previous cgroup (charge migration is off by default; see memory.move_charge_at_immigrate), so the existing usage never counts against ruletest.
[root@ecs ~/tmp]# ps aux | grep a.out
root 1236 0.0 0.0 4608 616 pts/3 S+ 15:11 0:00 ./a.out
Restart the program and add it to the group before it crosses 64 KB:
[1544] - memory is allocated [ 61440] bytes
[1544] - memory is allocated [ 61952] bytes
[1544] - memory is allocated [ 62464] bytes
[1544] - memory is allocated [ 62976] bytes
[1544] - memory is allocated [ 63488] bytes
[1544] - memory is allocated [ 64000] bytes
[1544] - memory is allocated [ 64512] bytes
[1544] - memory is allocated [ 65024] bytes
[1544] - memory is allocated [ 65536] bytes
[1544] - memory is allocated [ 66048] bytes
[1544] - memory is allocated [ 66560] bytes
[1544] - memory is allocated [ 67072] bytes
[1544] - memory is allocated [ 67584] bytes
[1544] - memory is allocated [ 68096] bytes
[1544] - memory is allocated [ 68608] bytes
[1544] - memory is allocated [ 69120] bytes
[1544] - memory is allocated [ 69632] bytes
[1544] - memory is allocated [ 70144] bytes
[1544] - memory is allocated [ 70656] bytes
[1544] - memory is allocated [ 71168] bytes
Killed
The allocations get slightly past 64 KB before the kill, since charging happens a page at a time. Check the OOM killer output in the kernel log:
[root@ecs ~/tmp]# cat /var/log/messages
...
...
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: a.out invoked oom-killer: gfp_mask=0xd0, order=0, oom_score_adj=0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: a.out cpuset=/ mems_allowed=0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: CPU: 7 PID: 1544 Comm: a.out Tainted: G ------------ T 3.10.0-957.5.1.el7.x86_64 #1
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 99a222b 04/01/2014
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Call Trace:
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b61e41>] dump_stack+0x19/0x1b
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b5c86a>] dump_header+0x90/0x229
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb25ba076>] ? find_lock_task_mm+0x56/0xc0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb25ba524>] oom_kill_process+0x254/0x3d0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2635346>] mem_cgroup_oom_synchronize+0x546/0x570
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb26347c0>] ? mem_cgroup_charge_common+0xc0/0xc0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb25badb4>] pagefault_out_of_memory+0x14/0x90
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b5ad72>] mm_fault_error+0x6a/0x157
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6f7a8>] __do_page_fault+0x3c8/0x500
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6f9c6>] trace_do_page_fault+0x56/0x150
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6ef42>] do_async_page_fault+0x22/0xf0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6b788>] async_page_fault+0x28/0x30
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Task in /ruletest killed as a result of limit of /ruletest
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: memory: usage 64kB, limit 64kB, failcnt 219
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: memory+swap: usage 64kB, limit 9007199254740988kB, failcnt 0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: kmem: usage 0kB, limit 9007199254740988kB, failcnt 0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Memory cgroup stats for /ruletest: cache:0KB rss:64KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:64KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [ 1544] 0 1544 1086 89 7 0 0 a.out
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Memory cgroup out of memory: Kill process 1544 (a.out) score 5875 or sacrifice child
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Killed process 1544 (a.out) total-vm:4344kB, anon-rss:76kB, file-rss:280kB, shmem-rss:0kB
1.3 I/O limit (blkio.throttle.read_bps_device)
Generate heavy disk reads:
[root@ecs ~/tmp]# dd if=/dev/mapper/vgdata-lvdata of=/dev/null
iostat and iotop show the read rate at over 60 MB/s:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
vdb 0.00 0.00 144.00 0.00 65536.00 0.00 910.22 0.87 7.19 7.19 0.00 1.16 16.70
dm-0 0.00 0.00 144.00 0.00 65536.00 0.00 910.22 1.03 7.19 7.19 0.00 1.82 26.20
Total DISK READ : 61.11 M/s | Total DISK WRITE : 0.00 B/s
Actual DISK READ: 61.11 M/s | Actual DISK WRITE: 0.00 B/s
TID PRIO USER DISK READ DISK WRITE SWAPIN IO> COMMAND
1714 be/4 root 61.11 M/s 0.00 B/s 0.00 % 0.98 % dd if=/dev/mapper/vgdata-lvdata of=/dev/null
Use ls -l on /dev/mapper/vgdata-lvdata to find the block device's major:minor numbers:
[root@ecs ~]# ll /dev/mapper/vgdata-lvdata
lrwxrwxrwx 1 root root 7 May 29 19:59 /dev/mapper/vgdata-lvdata -> ../dm-0
[root@ecs ~]# ll /dev/dm-0
brw-rw---- 1 root disk 252, 0 May 29 19:59 /dev/dm-0
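The same major:minor pair can also be obtained programmatically; here is a minimal sketch using stat(2) (the /dev/dm-0 default argument just mirrors this example):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>   /* major(), minor() */

int main(int argc, char **argv)
{
    struct stat st;
    const char *dev = argc > 1 ? argv[1] : "/dev/dm-0";

    if (stat(dev, &st) < 0) {
        perror("stat");
        return 1;
    }
    if (!S_ISBLK(st.st_mode)) {
        fprintf(stderr, "%s is not a block device\n", dev);
        return 1;
    }
    /* for device special files, st_rdev holds the device number */
    printf("%u:%u\n", major(st.st_rdev), minor(st.st_rdev));
    return 0;
}

stat() follows symlinks, so passing /dev/mapper/vgdata-lvdata prints the same 252:0.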
Create the blkio cgroup (mkdir /sys/fs/cgroup/blkio/ruletest first if it does not exist), throttle reads on device 252:0 to 1 MB/s (1048576 bytes/s), and add the dd process to the group:
[root@ecs ~]# echo '252:0 1048576' > /sys/fs/cgroup/blkio/ruletest/blkio.throttle.read_bps_device
[root@ecs ~]# cat /sys/fs/cgroup/blkio/ruletest/blkio.throttle.read_bps_device
252:0 1048576
[root@ecs ~]# echo 1714 > /sys/fs/cgroup/blkio/ruletest/tasks
[root@ecs ~]# cat /sys/fs/cgroup/blkio/ruletest/tasks
1714
iostat -x 1 now shows reads throttled to roughly 1 MB/s:
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
vdb 0.00 0.00 2.00 0.00 1008.00 0.00 1008.00 0.00 1.00 1.00 0.00 1.00 0.20
dm-0 0.00 0.00 2.00 0.00 1008.00 0.00 1008.00 0.00 1.00 1.00 0.00 1.00 0.20
2 cgroup Internals
2.1 Subsystems
With the hands-on feel from the experiments above, let's look at the subsystems a control group can have:
- blkio: sets input/output limits on block devices such as disks, SSDs, and USB drives.
- cpu: uses the scheduler to control the tasks' access to CPU time.
- cpuacct: automatically generates reports on the CPU usage of tasks in a cgroup.
- cpuset: assigns individual CPUs (on multicore systems) and memory nodes to tasks in a cgroup.
- devices: allows or denies access to devices by tasks in a cgroup.
- freezer: suspends or resumes tasks in a cgroup.
- memory: sets memory limits for tasks in a cgroup and automatically generates memory usage reports.
- net_cls: tags network packets with a class identifier (classid) so that the Linux traffic controller (tc) can identify packets coming from a particular cgroup.
- net_prio: sets the priority of network traffic generated by a cgroup.
- hugetlb: limits the use of HugeTLB (large) pages.
2.2 Terminology
- Task: a process in the system.
- Control Group: a set of processes partitioned by some criterion, such as Professor and Student, or WWW and System in the official docs. All resource control in cgroups is applied per control group: a process joins a control group, and the resource limits are defined on that group, like the ruletest group used in the examples above. In practical terms, a cgroup is simply a directory containing a set of configurable files.
- Hierarchy: control groups can be organized hierarchically, i.e., as a tree of control groups (a directory tree). Child nodes inherit the attributes of their parents. Put simply, a hierarchy is a cgroup directory tree with one or more subsystems attached.
- Subsystem: a subsystem is a resource controller; for example, the cpu subsystem controls the allocation of CPU time. A subsystem must be attached to a hierarchy to take effect, and once attached, every control group in that hierarchy is governed by it. There are many subsystems, and new ones keep being added.
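Tying the terminology back to the experiments: below is a minimal C sketch of the cgroup v1 pattern used throughout this post (create a group directory under a hierarchy, write a limit file, move a pid into tasks). Error handling is minimal, the paths repeat the ruletest example, and it must run as root.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

/* write a string to a cgroup control file */
static int write_file(const char *path, const char *value)
{
    int fd = open(path, O_WRONLY);
    if (fd < 0)
        return -1;
    ssize_t n = write(fd, value, strlen(value));
    close(fd);
    return n < 0 ? -1 : 0;
}

int main(void)
{
    char pid[32];

    /* 1. a control group is a directory: mkdir in a subsystem's hierarchy */
    mkdir("/sys/fs/cgroup/cpu/ruletest", 0755);

    /* 2. limits are files: write the quota */
    write_file("/sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us", "20000");

    /* 3. membership is a file too: write our pid into tasks */
    snprintf(pid, sizeof(pid), "%d", getpid());
    return write_file("/sys/fs/cgroup/cpu/ruletest/tasks", pid) == 0 ? 0 : 1;
}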
3 Best Practices
(To be collected and expanded.)
4 CFS
4.1 Computing CPU usage (the wrong way)
cat cpuacct.usage;cat cpu.stat| grep nr_periods;sleep 60;cat cpuacct.usage;cat cpu.stat| grep nr_periods;cat cpu.cfs_period_us
36161906134505
nr_periods 1201535
36162489738476
nr_periods 1201613
50000
$(36162489738476 - 36161906134505) / (1201613 - 1201535) / 50000 / 1000 \times 100 \approx 14.96\%$
This formula is wrong because nr_periods is not a wall clock: it only advances while the group has runnable tasks. Here the 78 periods cover only 78 × 50 ms = 3.9 s, while the sampling window was 60 s; measured against real time, the usage is $583603971 / (60 \times 10^9) \times 100 \approx 0.97\%$.
4.2 Computing CPU usage (as systemd-cgtop does)
CPU usage = delta of cpuacct.usage / delta of real (monotonic) time, both in nanoseconds. Since cpuacct.usage accumulates consumed CPU time in nanoseconds, the ratio is directly the fraction of one core in use (1.0 = one full core busy).
The relevant code, from systemd's cgtop.c:
} else {
        if (!streq(controller, "cpuacct"))
                return 0;

        r = cg_get_path(controller, path, "cpuacct.usage", &p);
        if (r < 0)
                return r;

        r = read_one_line_file(p, &v);
        if (r == -ENOENT)
                return 0;
        if (r < 0)
                return r;

        r = safe_atou64(v, &new_usage);
        if (r < 0)
                return r;
}

timestamp = now_nsec(CLOCK_MONOTONIC);

if (g->cpu_iteration == iteration - 1 &&
    (nsec_t) new_usage > g->cpu_usage) {

        nsec_t x, y;

        x = timestamp - g->cpu_timestamp;
        if (x < 1)
                x = 1;

        y = (nsec_t) new_usage - g->cpu_usage;

        /* y = delta of cpuacct.usage, x = real time delta from now_nsec() */
        g->cpu_fraction = (double) y / (double) x;
        g->cpu_valid = true;
}

g->cpu_usage = (nsec_t) new_usage;
g->cpu_timestamp = timestamp;
g->cpu_iteration = iteration;
}
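For reference, here is a minimal standalone sketch of the same method: sample cpuacct.usage twice and divide the usage delta by the monotonic wall-clock delta (the ruletest path is this post's example, not something cgtop assumes). With the deadloop from section 1.1 still capped, it should print roughly 20%.

#include <inttypes.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* read the cumulative CPU time (ns) charged to the cgroup */
static uint64_t read_usage(const char *path)
{
    uint64_t v = 0;
    FILE *f = fopen(path, "r");
    if (f) {
        if (fscanf(f, "%" SCNu64, &v) != 1)
            v = 0;
        fclose(f);
    }
    return v;
}

/* monotonic wall-clock time in nanoseconds */
static uint64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t) ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
    const char *p = "/sys/fs/cgroup/cpu/ruletest/cpuacct.usage";

    uint64_t u0 = read_usage(p), t0 = now_ns();
    sleep(1);
    uint64_t u1 = read_usage(p), t1 = now_ns();

    /* same as cgtop's cpu_fraction: y / x */
    printf("cpu usage: %.1f%%\n", 100.0 * (double)(u1 - u0) / (double)(t1 - t0));
    return 0;
}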
4.3 How CFS allocates cores
cpu.cfs_period_us: the length of one enforcement period.
cpu.cfs_quota_us: how much CPU time the group may use within each period.
For example:
cpu.cfs_period_us = 50000
cpu.cfs_quota_us = 200000
means that in each 50 ms period the group may use up to 200 ms of CPU time, i.e., the resources of 4 cores.
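In general, $cores = cfs\_quota\_us / cfs\_period\_us$ (here $200000 / 50000 = 4$); to cap a group at $N$ cores, set the quota to $N$ times the period. A quota of -1, the default, means no cap.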