记一次Web服务器lstat系统调用严重故障分析

环境说明

1
2
3
Centos6.5 x64
Tengine-2.1.0
PHP-5.5.30

随着业务流量增加,根据监控发现系统%sys的CPU占用率一度超过40%~50%(业务高峰期更为严重),
由于系统跑的是PHP,Nginx服务没有其它额外服务,PHP又容易成为系统的瓶颈,故使用strace跟踪了php-fpm进程的调用,如图:

看现象

通过top命令查看实时状态,%sys内核态CPU占比非常高,负载较高

通过strace跟踪php-fpm进程运行时的系统调用 strace -c -p $(pgrep -n php-fpm)

lstat调用到底干什么的呢?manlstat
manlstatte

分析问题

根据上面的现象可以看到lstat调用占用绝大部份的内核态CPU时间,可以通过strace跟踪php-fpm详细的lstat调用规律

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
strace  -o strace.out -r -s 256 $(pgrep -n php-fpm)
0.000051 getcwd("/data/wwwroot/run/platv3/public", 4096) = 43
0.000037 lstat("/data/wwwroot/run/platv3/public/../application/config/production/frontend_config.php", 0x7fffbe160260) = -1 ENOENT (No such file or directory)
0.000055 lstat("/data/wwwroot/run/platv3/public/../application/config/production", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000058 lstat("/data/wwwroot/run/platv3/public/../application/config", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000054 lstat("/data/wwwroot/run/platv3/public/../application", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000052 lstat("/data/wwwroot/run/platv3/public", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000050 lstat("/data/wwwroot/run/platv3", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000058 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000046 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000043 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000045 lstat("/data/wwwroot/run/platv3/application/config/production/frontend_config.php", 0x7fffbe160270) = -1 ENOENT (No such file or directory)
0.000050 readlink("/data/wwwroot/run/platv3/application/config/production/frontend_config.php", 0x7fffbe162410, 4095) = -1 ENOENT (No such file or directory)
0.000077 lstat("/data/wwwroot/run/platv3/application/config/production", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000059 lstat("/data/wwwroot/run/platv3/application/config", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000052 lstat("/data/wwwroot/run/platv3/application", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000050 lstat("/data/wwwroot/run/platv3", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000047 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000070 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000046 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000046 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000047 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000044 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000045 access("../application/config/production/frontend_config.php", F_OK) = -1 ENOENT (No such file or directory)
0.000051 getcwd("/data/wwwroot/run/platv3/public", 4096) = 43
0.000038 getcwd("/data/wwwroot/run/platv3/public", 4096) = 43
0.000036 lstat("/data/wwwroot/run/platv3/public/../application/config/frontend_config.php", {st_mode=S_IFREG|0775, st_size=1556, ...}) = 0
0.000069 lstat("/data/wwwroot/run/platv3/public/../application/config", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000057 lstat("/data/wwwroot/run/platv3/public/../application", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000053 lstat("/data/wwwroot/run/platv3/public", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000052 lstat("/data/wwwroot/run/platv3", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000049 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000075 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000045 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000046 lstat("/data/wwwroot/run/platv3/application/config/frontend_config.php", {stG|0775, st_size=9882, ...}) = 0
0.000056 lstat("/data/wwwroot/run/platv3/application/libraries", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000052 lstat("/data/wwwroot/run/platv3/application", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000049 lstat("/data/wwwroot/run/platv3", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000048 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000045 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000043 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000046 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000046 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000043 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000111 access("../application/libraries/X_Session.php", F_OK) = 0
0.000071 getcwd("/data/wwwroot/run/platv3/public", 4096) = 43
0.000043 lstat("/data/wwwroot/run/platv3/system/libraries/Session.php", {st_mode=S_IFREG|0775, st_size=19266, ...}) = 0
0.000060 lstat("/data/wwwroot/run/platv3/system/libraries", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000051 lstat("/data/wwwroot/run/platv3/system", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000050 lstat("/data/wwwroot/run/platv3", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000049 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000055 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000045 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000045 lstat("/data/wwwroot/run/platv3/system/libraries/Session.php", {st_mode=S_IFREG|0775, st_size=19266, ...}) = 0
0.000055 lstat("/data/wwwroot/run/platv3/system/libraries", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000053 lstat("/data/wwwroot/run/platv3/system", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000052 lstat("/data/wwwroot/run/platv3", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000053 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000047 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000051 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000047 lstat("/data/wwwroot/run/platv3/public", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000058 lstat("/data/wwwroot/run/platv3", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000114 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000048 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000043 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000046 lstat("/data/wwwroot/run/platv3/system/libraries/Session.php", {st_mode=S_IFREG|0775, st_size=19266, ...}) = 0
0.000055 lstat("/data/wwwroot/run/platv3/system/libraries", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000052 lstat("/data/wwwroot/run/platv3/system", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000050 lstat("/data/wwwroot/run/platv3", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000061 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000051 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000044 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000045 lstat("/data/wwwroot/run/platv3/system/libraries/Session.php", {st_mode=S_IFREG|0775, st_size=19266, ...}) = 0
0.000055 lstat("/data/wwwroot/run/platv3/system/libraries", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000053 lstat("/data/wwwroot/run/platv3/system", {st_mode=S_IFDIR|0775, st_size=4096, ...}) = 0
0.000049 lstat("/data/wwwroot/run/platv3", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000050 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000046 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000045 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000045 lstat("/data/wwwroot/run", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000120 lstat("/data/wwwroot", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
0.000055 lstat("/data", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0

由上面的信息不难看到程序在使用include,require,require_once,include_once,fopen,gzopen等开启了open_basedir之后,lstat调用会递归访问的文件目录查看文件位置限制,文件在open_basedir指定的目录以外时将拒绝打开,所有符号链接也会被解析无法避免此限制 ,这样就导致了lstat()调用比不开启open_basedir调用频率高很多;另外include,require,require_once,include_once在包含相对路径时,如果代码中存在大量的这样的语句,每次都从include_path中查找相应的文件,也会造成性能问题,所以通常用 realpath_cache_size 和realpath_cache_ttl来对文件的realpath进行缓存,但是开启了open_basedir之后,这个缓存是失效的;查看了官方的bug:https://bugs.php.net/bug.php?id=52312 有记录并没有修复,估计是考虑到安全问题

验证问题

大概了解了问题原因之后,对症下药来做测试,按三种情况测试下性能:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
1.开启open_basedir
Disable PHP's open_basedir directive so that the realpath cache won't be
; disabled.
; Remember, turbo_realpath will set this option later to the
; realpath_cache_basedir value.
open_basedir = "/data/wwwroot/run/:/tmp/"
2.使用realpath_turbo进行优化,开启open_basedir时,依然可以使用realpath cache;
安装realpath_turbo扩展https://github.com/Whissi/realpath_turbo,修改php.ini
#you have to load the extension first
extension=turbo_realpath.so
; realpath_turbo security mode
; Possible values:
; 0 - Ignore potential security issues
; 1 - Disable dangerous PHP functions (link,symlink)
realpath_cache_security = 1
; Set realpath_cache_basedir to whatever you want to set open_basedir to
realpath_cache_basedir = "/data/wwwroot/run/:/tmp/"
; Disable PHP's open_basedir directive so that the realpath cache won't be
; disabled.
; Remember, turbo_realpath will set this option later to the
; realpath_cache_basedir value.
open_basedir = ""
realpath_cache_size = 20m
; Duration of time, in seconds for which to cache realpath information for a given
; file or directory. For systems with rarely changing files, consider increasing this
; value.
; http://php.net/realpath-cache-ttl
realpath_cache_ttl = 43200
注意事项:使用realpath_turbo时,要关闭创建和操作符号链接的函数,否则会绕过 open_basedir安全限制,但依然不推荐在多虚拟主机环境使用
3.关闭open_basedir,增加realpath缓存
realpath_cache_size = 20m
; Duration of time, in seconds for which to cache realpath information for a given
; file or directory. For systems with rarely changing files, consider increasing this
; value.
; http://php.net/realpath-cache-ttl
realpath_cache_ttl = 43200

写脚本分别采集三种环境下系统调用占CPU时间百分比(采集半小时)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
#*_* coding:utf-8 *_*
import time
import psutil
import os
end=1
f = open('openbasedir.log',mode='w')
f.write("%s\t\t%s\t\t\t%s\t\t%s\t\t%s" % ('id','time','sys_percent','hi','si') + "\n")
while end >= 1800:
nowtime=time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))
ps = psutil.cpu_times_percent(interval=1)
data="%s\t\t%s\t\t%s\t\t%s\t\t%s" % (end,nowtime,ps.system,ps.irq,ps.softirq)
f.write(data+os.linesep)
end +=1
最终性能对比结果如图(横轴为采样id(s),纵轴为%sys占CPU时间比):

tongji

结论

为了性能暂时关闭了open_basedir参数,开启realpath cache缓存,代码尽量少用include,require,include_once,require_once等

坚持原创分享,您的支持将鼓励我继续创作