-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathcgroup.c
336 lines (288 loc) · 8.89 KB
/
cgroup.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <ftw.h>
#include <libgen.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <syslog.h>
#include <unistd.h>
#include "cgroup.h"
#include "config.h"
#include "errutil.h"
#include "fd.h"
#include "path.h"
#include "util.h"
/* Driver vtables referenced by the cgroup_drivers table below. The systemd
   driver is only declared when the build defines HAVE_SYSTEMD. */
extern const struct cgroup_driver_funcs cgroup_driver_none;
extern const struct cgroup_driver_funcs cgroup_driver_native;
#ifdef HAVE_SYSTEMD
extern const struct cgroup_driver_funcs cgroup_driver_systemd;
#endif
/* Lookup table mapping an enum cgroup_driver value to that driver's function
   table. Entries are placed with designated initializers, so any index that
   is not listed here is left as a NULL pointer. */
static const struct cgroup_driver_funcs *cgroup_drivers[] = {
[CGROUP_DRIVER_NONE] = &cgroup_driver_none,
[CGROUP_DRIVER_NATIVE] = &cgroup_driver_native,
#ifdef HAVE_SYSTEMD
/* Only present when built with systemd support. */
[CGROUP_DRIVER_SYSTEMD] = &cgroup_driver_systemd,
#endif
};
/* Driver currently in use; it is the index used into cgroup_drivers by
   cgroup_join and cgroup_current_path. It starts out as -1, the same value
   cgroup_driver_init treats as "no specific driver requested". */
static enum cgroup_driver cgroup_detected_driver = -1;
/* Select and initialize the cgroup driver used by cgroup_join and
   cgroup_current_path.

   driver: a specific driver ID, or -1 to probe the built-in candidates.
   fatal:  when true, failure to initialize terminates the process via errx.

   With a specific driver, that driver's init hook is called with `fatal` and
   its return value is handed back to the caller. With -1, each candidate is
   tried in order with a non-fatal init; the first one returning >= 0 is
   recorded as the active driver and 0 is returned.

   Returns a negative value when nothing could be initialized and `fatal` is
   false. In the probing case the active driver is then left at -1. */
int cgroup_driver_init(enum cgroup_driver driver, bool fatal)
{
	cgroup_detected_driver = driver;

	if (cgroup_detected_driver != (enum cgroup_driver)-1) {
		/* The table is built with designated initializers, so besides the
		   range check also reject slots that were never filled in. */
		if (cgroup_detected_driver < 0
				|| cgroup_detected_driver >= lengthof(cgroup_drivers)
				|| cgroup_drivers[cgroup_detected_driver] == NULL) {
			errx(1, "unknown cgroup driver ID %d", (int) cgroup_detected_driver);
		}

		int rc = cgroup_drivers[cgroup_detected_driver]->init(fatal);
		if (rc < 0 && fatal) {
			errx(1, "cgroup_driver_init: cgroup driver failed to initialize");
		}
		return rc;
	}

	/* Probe order: most specific driver first. */
	static const enum cgroup_driver attempts[] = {
#ifdef HAVE_SYSTEMD
		CGROUP_DRIVER_SYSTEMD,
#endif
		CGROUP_DRIVER_NATIVE,
	};

	for (size_t i = 0; i < lengthof(attempts); i++) {
		if (attempts[i] < 0
				|| attempts[i] >= lengthof(cgroup_drivers)
				|| cgroup_drivers[attempts[i]] == NULL) {
			/* Report the candidate that is actually wrong; the detected
			   driver is still -1 at this point and would be meaningless. */
			errx(1, "cgroup_driver_init: programming error: unexpected cgroup driver ID %d", (int) attempts[i]);
		}

		if (cgroup_drivers[attempts[i]]->init(false) >= 0) {
			cgroup_detected_driver = attempts[i];
			return 0;
		}
	}

	if (fatal) {
		errx(1, "cgroup_driver_init: no cgroup driver initialized successfully");
	}
	return -1;
}
int cgroup_join(const char *parent, const char *name)
{
return cgroup_drivers[cgroup_detected_driver]->join_cgroup(parent, name);
}
bool cgroup_current_path(char *path)
{
return cgroup_drivers[cgroup_detected_driver]->current_path(path);
}
/* Look up the "0::/..." entry of a process's cgroup file.

   procfd: -1 to read /proc/self/cgroup, otherwise a directory descriptor
           relative to which the file "cgroup" is opened.
   path:   if non-NULL and an entry is found, receives "/sys/fs/cgroup/"
           followed by the entry's path (formatted through makepath_r).

   Returns true when a line starting with "0::/" was found, false otherwise.
   Failure to open the file terminates the process via err. */
bool cgroup_read_current(int procfd, char *path)
{
	FILE *fp;

	if (procfd != -1) {
		int rawfd = openat(procfd, "cgroup", O_RDONLY | O_CLOEXEC);
		if (rawfd == -1) {
			err(1, "unable to derive current cgroup hierarchy from /proc/self/cgroup");
		}
		fp = fdopen(rawfd, "r");
		if (fp == NULL) {
			err(1, "fdopen /proc/self/cgroup");
		}
	} else {
		fp = fopen("/proc/self/cgroup", "r");
		if (fp == NULL) {
			err(1, "unable to derive current cgroup hierarchy from /proc/self/cgroup");
		}
	}

	static const char prefix[] = "0::/";
	char line[BUFSIZ];
	const char *found = NULL;

	/* Stop reading as soon as the first matching line has been seen. */
	while (found == NULL && fgets(line, sizeof (line), fp) != NULL) {
		if (strncmp(line, prefix, sizeof (prefix) - 1) != 0) {
			continue;
		}
		/* Drop the newline kept by fgets. */
		line[strcspn(line, "\n")] = '\0';
		/* Skip "0::" but keep the leading '/'. */
		found = line + 3;
	}
	fclose(fp);

	if (found == NULL) {
		return false;
	}
	if (path != NULL) {
		makepath_r(path, "/sys/fs/cgroup/%s", found);
	}
	return true;
}
/* nftw() callback used to tear down a cgroup hierarchy.

   For every directory reported with FTW_D, try to rmdir it and, for as long
   as that succeeds, walk upwards and rmdir each parent too, going at most
   ftwbuf->level steps up (i.e. not above the root of the walk). Other entry
   types are ignored.

   Always returns 0 so that the tree walk carries on regardless of individual
   rmdir failures. */
static int rm_cgroup(const char *fpath, const struct stat *sb, int tflag, struct FTW *ftwbuf)
{
	(void) sb;

	if (tflag != FTW_D) {
		return 0;
	}

	/* Work on a private, always NUL-terminated copy: dirname() may modify
	   its argument. A path that does not fit cannot be handled; skip it. */
	char path[PATH_MAX];
	size_t len = strlen(fpath);
	if (len >= sizeof (path)) {
		return 0;
	}
	memcpy(path, fpath, len + 1);

	for (int level = ftwbuf->level; level >= 0; level--) {
		if (rmdir(path) == -1) {
			break;
		}

		/* dirname() is allowed to either edit the buffer in place or return
		   a pointer to separate storage; use its result in both cases. */
		char *parent = dirname(path);
		if (parent != path) {
			memmove(path, parent, strlen(parent) + 1);
		}
	}
	return 0;
}
/* If bst has entered a cgroup this function will epoll the cgroup.events file
   to detect when all pids have exited the cgroup ("populated 0"). The cgroup is
   destroyed when this condition is met.

   lock:     read end of a pipe; the function blocks reading from it and
             proceeds once the read returns.
   parentfd: directory descriptor relative to which `name` is resolved.
   name:     name of the cgroup directory to watch and remove.

   Errors are reported with warn(). When the cgroup path could not be
   determined, a single unlinkat(AT_REMOVEDIR) on parentfd/name is attempted
   instead of the recursive removal. */
static void run_cleaner_child(int lock, int parentfd, const char *name)
{
	/* Wait for the parent to die before proceeding */
	int ok;
	switch (read(lock, &ok, sizeof (ok))) {
	case -1:
		warn("run_cgroup_child: read on lock");
		goto lastDitchEffort;
	case 0:
		break;
	default:
		break;
	}

	int cgroupfd = openat(parentfd, name, O_RDONLY | O_DIRECTORY, 0);
	if (cgroupfd == -1) {
		switch (errno) {
		case ENOENT:
			/* Parent died before it made the cgroup; nothing to do */
			return;
		default:
			warn("run_cgroup_child: open %s", name);
			goto lastDitchEffort;
		}
	}

	/* Resolve the absolute path of the cgroup directory through the
	   /proc/self/fd symlink so it can be handed to nftw() later. */
	char fdpath[PATH_MAX];
	makepath_r(fdpath, "/proc/self/fd/%d", cgroupfd);

	char cgroup_path[PATH_MAX];
	/* readlink() does not NUL-terminate; keep one byte for the terminator
	   and treat a completely filled buffer as a truncated result. */
	ssize_t pathlen = readlink(fdpath, cgroup_path, sizeof (cgroup_path) - 1);
	if (pathlen != -1 && (size_t) pathlen >= sizeof (cgroup_path) - 1) {
		errno = ENAMETOOLONG;
		pathlen = -1;
	}
	if (pathlen == -1) {
		warn("run_cgroup_child: readlink");
		goto lastDitchEffort;
	}
	cgroup_path[pathlen] = '\0';

	int eventfd = openat(cgroupfd, "cgroup.events", 0);
	if (eventfd == -1) {
		warn("run_cgroup_child: open cgroup.events");
		goto recursiveClean;
	}

	struct epoll_event event = {
		.events = 0,
	};

	int epollfd = epoll_create1(0);
	if (epollfd == -1) {
		warn("run_cgroup_child: epoll_create1");
		goto recursiveClean;
	}

	if (epoll_ctl(epollfd, EPOLL_CTL_ADD, eventfd, &event) == -1) {
		warn("run_cgroup_child: epoll_ctl_add cgroupfd");
		goto recursiveClean;
	}

	/* The first event is the initial state of the file; skip it, because
	   at that point the cgroup is still empty, and we'll have populated 0 */
	epoll_wait(epollfd, &event, 1, -1);

	FILE *eventsfp = fdopen(eventfd, "r");
	if (eventsfp == NULL) {
		warn("run_cgroup_child: fdopen cgroup.events");
		goto recursiveClean;
	}

	char populated[BUFSIZ];
	for (;;) {
		int ready = epoll_wait(epollfd, &event, 1, -1);
		if (ready == -1 && errno != EINTR) {
			warn("run_cgroup_child: epoll_wait cgroup.events");
			goto recursiveClean;
		}

		/* Re-read the whole file from the start on every wakeup. */
		rewind(eventsfp);

		/* The order of elements in cgroup.events is not necessarily specified. */
		while (fgets(populated, BUFSIZ, eventsfp) != NULL) {
			/* NOTE(review): fgets always NUL-terminates within the buffer,
			   so this length check is not expected to trigger. */
			if (strnlen(populated, sizeof(populated)) == sizeof(populated)) {
				warn("run_cgroup_child: exceeded cgroup.events line read buffer");
				goto recursiveClean;
			}
			if (strncmp(populated, "populated 0", 11) == 0) {
				goto recursiveClean;
			}
		}
	}

	/* Let the process exit; no need to clean up fds. We don't need to
	   set any exit code since no parent process cares about them. */

recursiveClean:
	nftw(cgroup_path, rm_cgroup, 128, 0);
	return;

lastDitchEffort:
	if (unlinkat(parentfd, name, AT_REMOVEDIR) == -1) {
		warn("run_cgroup_child: unlinkat");
	}
	return;
}
/* Fork the background process that removes the cgroup `name` (resolved
   relative to parentfd) once it is no longer needed.

   A pipe ties the two processes together: the child blocks reading from it
   inside run_cleaner_child, while the calling process keeps the write end
   open for the rest of its life. Failure of pipe2 or fork terminates the
   caller via err. */
void cgroup_start_cleaner(int parentfd, const char *name)
{
	int lockpipe[2];
	if (pipe2(lockpipe, O_CLOEXEC) == -1) {
		err(1, "cgroup_start_cleaner: pipe2");
	}

	pid_t child = fork();
	if (child == -1) {
		err(1, "cgroup_start_cleaner: fork");
	}

	if (child != 0) {
		/* Calling process: only the read end is closed. */
		close(lockpipe[0]);

		/* Deliberately leak lockpipe[1]. This _is_ important. It will get
		   closed once this process dies, releasing the read(2) lock of the
		   cgroup cleaner. */
		return;
	}

	/* Cleaner process from here on. It is intentionally left to leak as the
	   bst root process must have exited and thus been removed from bst's
	   cgroup.procs for the cgroup hierarchy to be removed. */

	/* Create a new session in case current group leader is killed */
	if (setsid() == -1) {
		err(1, "cgroup_start_cleaner: setsid");
	}

	/* Make sure all file descriptors except for the ones we're actually
	   using get closed. This avoids keeping around file descriptors on which
	   the parent process might be waiting on. */
	rebind_fds_and_close_rest(3, &lockpipe[0], &parentfd, NULL);

	/* From now on, use syslog to report error messages. This is necessary
	   since the parent bst process might be gone by the time there's an
	   error, and whatever started it might not be there to report the
	   error anymore. */
	openlog("bst", LOG_CONS | LOG_PID, LOG_USER);
	err_flags |= ERR_USE_SYSLOG;

	run_cleaner_child(lockpipe[0], parentfd, name);
	_exit(0);
}
void cgroup_enable_controllers(int cgroupfd)
{
char controllers[BUFSIZ] = {0};
int cfd = openat(cgroupfd, "cgroup.controllers", O_RDONLY, 0);
if (cfd == -1) {
err(1, "cgroup_enable_controllers: open cgroup.controllers");
}
if (read(cfd, controllers, sizeof (controllers)) == sizeof (controllers)) {
errx(1, "cgroup_enable_controllers: read cgroup.controllers: too many controllers");
}
if (close(cfd) == -1) {
err(1, "cgroup_enable_controllers: close cgroup.controllers");
}
int scfd = openat(cgroupfd, "cgroup.subtree_control", O_WRONLY, 0);
if (scfd == -1) {
err(1, "cgroup_enable_controllers: open cgroup.subtree_control");
}
char buf[BUFSIZ];
buf[0] = '+';
for (char *controller = strtok(controllers, " "); controller != NULL; controller = strtok(NULL, " ")) {
char *last = stpncpy(buf + 1, controller, sizeof (buf) - 1);
size_t len = last - buf;
if (write(scfd, buf, len) == (ssize_t)-1) {
err(1, "cgroup_enable_controllers: write %s into cgroup.subtree_control", buf);
}
}
if (close(scfd) == -1) {
err(1, "cgroup_enable_controllers: close cgroup.subtree_control");
}
}
/* init hook of the "none" driver: ignores its argument and always reports
   failure by returning -1. */
static int cgroup_none_driver_init(bool fatal)
{
	(void) fatal;

	const int failure = -1;
	return failure;
}
/* current_path hook of the "none" driver: never touches `path` and always
   returns false. */
static bool cgroup_none_current_path(char *path)
{
	(void) path;

	const bool found = false;
	return found;
}
/* join_cgroup hook of the "none" driver: ignores both arguments and always
   returns -1. */
static int cgroup_none_join_cgroup(const char *parent, const char *name)
{
	(void) parent;
	(void) name;

	const int failure = -1;
	return failure;
}
/* Function table of the "none" driver. Each hook is one of the
   cgroup_none_* stubs defined in this file. */
const struct cgroup_driver_funcs cgroup_driver_none = {
	.current_path = cgroup_none_current_path,
	.join_cgroup  = cgroup_none_join_cgroup,
	.init         = cgroup_none_driver_init,
};