https://github.com/lxc/lxc
lxc_start.c
int main(int argc, char *argv[])
{
int err = 1;
struct lxc_conf *conf; //初始化config结构
char *const *args; //传递的参数
char *rcfile = NULL; //指定配置文件
char *const default_args[] = { //默认的args参数
"/sbin/init",
NULL,
};
struct lxc_container *c; //lxc-container 的结构体
….
}
lxc_conf这个数据结构
struct lxc_conf {
int is_execute; //容器是否在执行
char *fstab; //fstab?
int tty; //tty的个数
int pts; //pts的个数?
int reboot; //重启?
int need_utmp_watch; //字面翻译 需要utmp 查看
signed long personality; //字面翻译 特点
struct utsname *utsname; //ustname
struct lxc_list cgroup; //cgroup list lxc_list只是简单的链表结构
struct lxc_list id_map; //id_map list
struct lxc_list network; //network list
struct saved_nic *saved_nics;//saved_nics 结构
int num_savednics; //savednics数量?
int auto_mounts; //auto_mounts?
struct lxc_list mount_list; //mount_list list?
struct lxc_list caps; //caps list?
struct lxc_list keepcaps; //keepcaps list?
struct lxc_tty_info tty_info; //tty的相关信息
struct lxc_console console; //console的结构体
struct lxc_rootfs rootfs; //rootfs的结构体
char *ttydir; //tty目录
int close_all_fds; //关闭所有fd
struct lxc_list hooks[NUM_LXC_HOOKS]; //hooks 函数
char *lsm_aa_profile; //?
char *lsm_se_context; //?
int tmp_umount_proc; //?
char *seccomp; // filename with the seccomp rules #if HAVE_SCMP_FILTER_CTX
scmp_filter_ctx seccomp_ctx; #endif
int maincmd_fd; //?
int autodev; // if 1, mount and fill a /dev at start
int haltsignal; // signal used to halt container
int stopsignal; // signal used to hard stop container
int kmsg; // if 1, create /dev/kmsg symlink
char *rcfile; // Copy of the top level rcfile we read
// Logfile and loglevel can be set in a container config file.
// Those function as defaults. The defaults can be overriden
// by command line. However we don't want the command line
// specified values to be saved on c->save_config(). So we
// store the config file specified values here.
char *logfile; // the logfile as specifed in config
int loglevel; // loglevel as specifed in config (if any)
int inherit_ns_fd[LXC_NS_MAX];
int start_auto;
int start_delay;
int start_order;
struct lxc_list groups;
int nbd_idx;
/* set to true when rootfs has been setup */
bool rootfs_setup; };
lxc_container的结构体
/*!
An LXC container.
*/
struct lxc_container {
// private fields
char *name; //container 的名字
char *configfile; // configuration file 的路径
char *pidfile; // 存储pid 的文件名
struct lxc_lock *slock; //Container semaphore lock. 容器的信号锁
struct lxc_lock *privlock;//容器的私有信号锁
int numthreads; //容器的引用数量,由privlock保护
struct lxc_conf *lxc_conf;
// public fields
char *error_string; //全局变量 可读的最后显示的error
int error_num; //最后error的数字
bool daemonize; //容器是否希望开启守护进程
char *config_path; // configuration file 的路径 和上面的区别? 全局?
……. //一堆成员函数 暂不看
}
lxc_list_init(&defines); //初始化list
defines定义在文件开始,为全局变量
static struct lxc_list defines;
if(lxc_caps_init()) //caps初始化
return err;
到这个函数里看一下。
/*
 * lxc_caps_init - adjust uid/gid and capabilities at startup.
 *
 * When invoked by real root there is nothing to do. When run as a
 * setuid-root binary by an unprivileged user, keep capabilities across
 * the credential switch (PR_SET_KEEPCAPS), drop the real/effective/saved
 * gid and uid back to the invoking user, then re-raise the preserved
 * capabilities via lxc_caps_up().
 *
 * Returns 0 on success, -1 on failure.
 */
int lxc_caps_init(void)
{
	uid_t uid = getuid();
	gid_t gid = getgid();
	uid_t euid = geteuid();

	if (!uid) {
		/* invoked by real root: skip the privilege juggling */
		INFO("command is run as 'root'");
		return 0;
	}

	if (uid && !euid) {
		INFO("command is run as setuid root (uid : %d)", uid);

		/* keep capabilities across the setresgid/setresuid below */
		if (prctl(PR_SET_KEEPCAPS, 1)) {
			ERROR("failed to 'PR_SET_KEEPCAPS': %m");
			return -1;
		}

		if (setresgid(gid, gid, gid)) {
			ERROR("failed to change gid to '%d': %m", gid);
			return -1;
		}

		if (setresuid(uid, uid, uid)) {
			ERROR("failed to change uid to '%d': %m", uid);
			return -1;
		}

		/* re-enable the capabilities KEEPCAPS preserved */
		if (lxc_caps_up()) {
			ERROR("failed to restore capabilities: %m");
			return -1;
		}
	}

	if (uid == euid)
		INFO("command is run as user '%d'", uid);

	return 0;
}
接着就是读传过来的参数
if(lxc_arguments_parse(&my_args, argc, argv))
return err;
这个函数就没细看,只需知道将参数传给my_args
判断有没有指定 初始执行的参数,没有的话指定默认参数
if (!my_args.argc)
args = default_args;
else
args = my_args.argv;
初始化一堆log的,暂时也没细看
if (lxc_log_init(my_args.name, my_args.log_file, my_args.log_priority,
my_args.progname, my_args.quiet, my_args.lxcpath[0]))
return err;
lxc_log_options_no_override();
const char *lxcpath = my_args.lxcpath[0]; //lxcpath 很有意思
// lxc_global_config_value(“lxc.lxcpath”)这个写的还是比较复杂的,总之lxcpath会是默认的路径
//指定config的位置,如果没指定,则使用默认的路径的config,通过配置创建新的
/*
 * rcfile possibilities:
 * 1. rcfile from random path specified in cli option
 * 2. rcfile not specified, use $lxcpath/$lxcname/config
 * 3. rcfile not specified and does not exist.
 */
/* rcfile is specified in the cli option */
if (my_args.rcfile) {
rcfile = (char *)my_args.rcfile;
c = lxc_container_new(my_args.name, lxcpath);
if (!c) {
ERROR("Failed to create lxc_container");
return err;
}
c->clear_config(c);
if (!c->load_config(c, rcfile)) {
ERROR("Failed to load rcfile");
lxc_container_put(c);
return err;
}
}
} else {
int rc;
rc = asprintf(&rcfile, "%s/%s/config", lxcpath, my_args.name);
if (rc == -1) {
SYSERROR("failed to allocate memory");
return err;
}
INFO("using rcfile %s", rcfile);
/* container configuration does not exist */
if (access(rcfile, F_OK)) {
free(rcfile);
rcfile = NULL;
}
c = lxc_container_new(my_args.name, lxcpath);
if (!c) {
ERROR("Failed to create lxc_container");
return err;
} }
里面最主要的函数c = lxc_container_new(my_args.name, lxcpath);
struct lxc_container *lxc_container_new(const char *name, const char *configpath)
{
struct lxc_container *c; //结构体lxc_container 前面分析过了
c = malloc(sizeof(*c)); //创建
if (!c) {
fprintf(stderr, "failed to malloc lxc_container\n");
return NULL;
}
memset(c, 0, sizeof(*c)); //初始0
if (configpath)
c->config_path = strdup(configpath); //config_path
else
c->config_path = strdup(lxc_global_config_value("lxc.lxcpath"));
if (!c->config_path) {
fprintf(stderr, "Out of memory\n");
goto err;
}
remove_trailing_slashes(c->config_path);
c->name = malloc(strlen(name)+1);
if (!c->name) {
fprintf(stderr, "Error allocating lxc_container name\n");
goto err;
}
strcpy(c->name, name);
c->numthreads = 1;
// lock这部分没细看
if (!(c->slock = lxc_newlock(c->config_path, name))) {
fprintf(stderr, "failed to create lock\n");
goto err;
}
if (!(c->privlock = lxc_newlock(NULL, NULL))) {
fprintf(stderr, "failed to alloc privlock\n");
goto err;
}
// set config path
if (!set_config_filename(c)) {
fprintf(stderr, "Error allocating config file pathname\n");
goto err;
}
//load config path
if (file_exists(c->configfile) && !lxcapi_load_config(c, NULL))
goto err;
//判断容器是否创建失败
if (ongoing_create(c) == 2) {
ERROR("Error: %s creation was not completed", c->name);
lxcapi_destroy(c);
lxcapi_clear_config(c);
}
c->daemonize = true;
c->pidfile = NULL;
…… //后面都是成员函数赋值 } 现在回到lxc_start 的main函数中 //判断容器是否在运行 if (c->is_running(c)) {
ERROR("Container is already running.");
err = 0;
goto out; } /* * We should use set_config_item() over &defines, which would handle * unset c->lxc_conf for us and let us not use lxc_config_define_load() */ //加载config文件 if (!c->lxc_conf)
c->lxc_conf = lxc_conf_init(); conf = c->lxc_conf; if (lxc_config_define_load(&defines, conf))
goto out; //提示信息 if (!rcfile && !strcmp("/sbin/init", args[0])) {
ERROR("Executing '/sbin/init' with no configuration file may crash the host");
goto out;
}
if (ensure_path(&conf->console.path, my_args.console) < 0) {
ERROR("failed to ensure console path '%s'", my_args.console);
goto out;
}
if (ensure_path(&conf->console.log_path, my_args.console_log) < 0) {
ERROR("failed to ensure console log '%s'", my_args.console_log);
goto out;
}
// pid 文件
if (my_args.pidfile != NULL) {
if (ensure_path(&c->pidfile, my_args.pidfile) < 0) {
ERROR("failed to ensure pidfile '%s'", my_args.pidfile);
goto out;
}
}
//一些share_ns 的配置,未细看
int i;
for (i = 0; i < LXC_NS_MAX; i++) {
if (my_args.share_ns[i] == NULL)
continue;
int pid = pid_from_lxcname(my_args.share_ns[i], lxcpath);
if (pid < 1)
goto out;
int fd = open_ns(pid, ns_info[i].proc_name);
if (fd < 0)
goto out;
conf->inherit_ns_fd[i] = fd;
}
//初始化为1
if (!my_args.daemonize) {
c->want_daemonize(c, false);
}
if (my_args.close_all_fds)
c->want_close_all_fds(c, true);
err = c->start(c, 0, args) ? 0 : 1;
if (err) {
ERROR("The container failed to start.");
if (my_args.daemonize)
ERROR("To get more details, run the container in foreground mode.");
ERROR("Additional information can be obtained by setting the "
"--logfile and --logpriority options.");
err = c->error_num;
lxc_container_put(c);
return err;
}
out:
lxc_container_put(c);
return err;
}
直接到c->start 过程start是调用 lxcapi_start 这个函数指针,现在去看下这个函数到底是怎么讲lxc container 启动起来的。
传过来的参数是container c,useinit 0,argv=args 即指定的初始化程序
static bool lxcapi_start(struct lxc_container *c, int useinit, char * const argv[])
{
int ret;
struct lxc_conf *conf;
bool daemonize = false; //守护进程为false
FILE *pid_fp = NULL; //pid_file文件的指针
char *default_args[] = { //又是default_args
"/sbin/init",
NULL,
};
/* container exists */
if (!c) //判断容器是否存在
return false;
/* container has been setup */
if (!c->lxc_conf) //config加载完美
return false;
if ((ret = ongoing_create(c)) < 0) { //容器是否创建完整
ERROR("Error checking for incomplete creation");
return false;
}
if (ret == 2) {
ERROR("Error: %s creation was not completed", c->name);
c->destroy(c);
return false;
} else if (ret == 1) {
ERROR("Error: creation of %s is ongoing", c->name);
return false;
}
/* is this app meant to be run through lxcinit, as in lxc-execute? */
if (useinit && !argv) //还是判断
return false;
if (container_mem_lock(c)) //lock
return false;
conf = c->lxc_conf; //conf赋值
daemonize = c->daemonize; //true
container_mem_unlock(c); //unlock
if (useinit) { //0
ret = lxc_execute(c->name, argv, 1, conf, c->config_path);
return ret == 0 ? true : false;
}
if (!argv)
argv = default_args; //又重新判断 args 是否为空,空即赋值
/*
* say, I'm not sure - what locks do we want here? Any?
* Is liblxc's locking enough here to protect the on disk
* container? We don't want to exclude things like lxc_info
* while container is running...
* 这段注释给跪了,还是老老实实看他想干嘛吧
*/
if (daemonize) { //true
lxc_monitord_spawn(c->config_path); //start好像跟前面的版本差别
pid_t pid = fork();
if (pid < 0)
return false;
if (pid != 0) {
/* Set to NULL because we don't want father unlink
* the PID file, child will do the free and unlink.
*/
c->pidfile = NULL;
return wait_on_daemonized_start(c, pid); //等下进去,里面有waitpid,所以先看后面
}
/* second fork to be reparented by init */
pid = fork(); //两次fork
if (pid < 0) {
SYSERROR("Error doing dual-fork");
return false;
}
if (pid != 0)
exit(0);
/* like daemon(), chdir to / and redirect 0,1,2 to /dev/null */
if (chdir("/")) { //root目录
SYSERROR("Error chdir()ing to /.");
return false;
}
lxc_check_inherited(conf, -1);
close(0); //pipe file? close(1);
close(2);
open("/dev/zero", O_RDONLY);
open("/dev/null", O_RDWR);
open("/dev/null", O_RDWR);
setsid();
} else {
if (!am_single_threaded()) {
ERROR("Cannot start non-daemonized container when threaded");
return false;
}
} /* We need to write PID file after daeminize, so we always
* write the right PID.
*/
if (c->pidfile) { //写入pid 到pidfile
pid_fp = fopen(c->pidfile, "w");
if (pid_fp == NULL) {
SYSERROR("Failed to create pidfile '%s' for '%s'",
c->pidfile, c->name);
return false;
}
if (fprintf(pid_fp, "%d\n", getpid()) < 0) {
SYSERROR("Failed to write '%s'", c->pidfile);
fclose(pid_fp);
pid_fp = NULL;
return false;
}
fclose(pid_fp);
pid_fp = NULL;
}
reboot:
…..
}
现在到 wait_on_daemonized_start(c, pid) 里面看看函数调用的情况
这个就是主线程的pid 在等待其他子线程工作完,然后执行,只能硬着头皮继续看了。
static bool wait_on_daemonized_start(struct lxc_container *c, int pid)
{
/* we'll probably want to make this timeout configurable? */
int timeout = 5, ret, status;
/*
* our child is going to fork again, then exit. reap the
* child
*/
ret = waitpid(pid, &status, 0);
if (ret == -1 || !WIFEXITED(status) || WEXITSTATUS(status) != 0)
DEBUG("failed waiting for first dual-fork child");
return lxcapi_wait(c, "RUNNING", timeout); } 函数很简单 直接调用了lxcapi_wait。
static bool lxcapi_wait(struct lxc_container *c, const char *state, int timeout)
{
int ret;
if (!c)
return false;
ret = lxc_wait(c->name, state, timeout, c->config_path);
return ret == 0; }
这个依旧很简单又跳走了。。。lxc_wait了
这个函数现在先不细说了,只是检查容器创建是否超时的问题。
reboot:
conf->reboot = 0;
ret = lxc_start(c->name, argv, conf, c->config_path);
…..
}
reboot 又调用lxc-start 泪奔。
/*
 * lxc_start - public entry point that starts a container running argv.
 * Wraps argv in a start_args, closes unwanted inherited fds, and delegates
 * to __lxc_start() with the default start_ops (whose ->start callback
 * exec()s argv inside the new namespaces).
 */
int lxc_start(const char *name, char *const argv[], struct lxc_conf *conf,
const char *lxcpath)
{
struct start_args start_arg = { // bundles argv for the start_ops->start callback
.argv = argv,
};
if (lxc_check_inherited(conf, -1)) // avoid leaking host fds into the container
return -1;
conf->need_utmp_watch = 1; // enables the utmp watcher (mainloop) used to detect in-container reboot/halt
return __lxc_start(name, conf, &start_ops, &start_arg, lxcpath); // hand off to the generic start path
}
My god 感觉好戏才刚刚开始。。。。
这里面就是lxc-start的全部,所以分开来讲,前面的废话太多,这次看重点
int __lxc_start(const char *name, struct lxc_conf *conf,
struct lxc_operations *ops, void *data, const char *lxcpath)
{
struct lxc_handler *handler; //结构体,保存container的一些属性
int err = -1;
int status;
int netnsfd = -1;
handler = lxc_init(name, conf, lxcpath); //init
这时候要跳到init中去看看
struct lxc_handler *lxc_init(const char *name, struct lxc_conf *conf, const char *lxcpath)
{
struct lxc_handler *handler;
handler = malloc(sizeof(*handler)); //初始化一堆 handler
if (!handler)
return NULL;
memset(handler, 0, sizeof(*handler));
handler->conf = conf;
handler->lxcpath = lxcpath;
handler->pinfd = -1;
lsm_init();
handler->name = strdup(name);
if (!handler->name) {
ERROR(“failed to allocate memory”);
goto out_free;
}
if (lxc_cmd_init(name, handler, lxcpath)) //cmd_init
goto out_free_name;
if (lxc_read_seccomp_config(conf) != 0) { //这货直接返回0,什么都没有
ERROR(“failed loading seccomp policy”);
goto out_close_maincmd_fd;
}
/* Begin by setting the state to STARTING */
if (lxc_set_state(name, handler, STARTING)) { //STARTING enum 类型
ERROR(“failed to set state ‘%s’”, lxc_state2str(STARTING));
goto out_close_maincmd_fd;
}
/* Start of environment variable setup for hooks */
if (setenv(“LXC_NAME”, name, 1)) {
SYSERROR(“failed to set environment variable for container name”);
}
if (setenv(“LXC_CONFIG_FILE”, conf->rcfile, 1)) {
SYSERROR(“failed to set environment variable for config path”);
}
if (setenv(“LXC_ROOTFS_MOUNT”, conf->rootfs.mount, 1)) {
SYSERROR(“failed to set environment variable for rootfs mount”);
}
if (setenv(“LXC_ROOTFS_PATH”, conf->rootfs.path, 1)) {
SYSERROR(“failed to set environment variable for rootfs mount”);
}
if (conf->console.path && setenv(“LXC_CONSOLE”, conf->console.path, 1)) {
SYSERROR(“failed to set environment variable for console path”);
}
if (conf->console.log_path && setenv(“LXC_CONSOLE_LOGPATH”, conf->console.log_path, 1)) {
SYSERROR(“failed to set environment variable for console log”);
}
Prestart 在这个位置,这个是可以配置到config文件中的
/* End of environment variable setup for hooks */
if (run_lxc_hooks(name, “pre-start”, conf, handler->lxcpath, NULL)) {
ERROR(“failed to run pre-start hooks for container ‘%s’.”, name);
goto out_aborting;
}
//创建tty
if (lxc_create_tty(name, conf)) {
ERROR(“failed to create the ttys”);
goto out_aborting;
}
这个函数打开的是/dev/ptmx这个东西还不是很了解,回头细看
和pts 是主从设备,然后分配pty?
/* the signal fd has to be created before forking otherwise
* if the child process exits before we setup the signal fd,
* the event will be lost and the command will be stuck */
handler->sigfd = setup_signal_fd(&handler->oldmask);
if (handler->sigfd < 0) {
ERROR(“failed to set sigchild fd handler”);
goto out_delete_tty;
}
/* do this after setting up signals since it might unblock SIGWINCH */
if (lxc_console_create(conf)) {
ERROR(“failed to create console”);
goto out_restore_sigmask;
}
if (ttys_shift_ids(conf) < 0) {
ERROR(“Failed to shift tty into container”);
goto out_restore_sigmask;
}
INFO(“‘%s’ is initialized”, name);
return handler;
}
Init完成,回到__lxc_start中
if (!handler) {
ERROR(“failed to initialize the container”);
return -1;
}
handler->ops = ops;
handler->data = data;
// lxc是否支持reboot,配置中handler->conf->need_utmp_watch=1表示支持
if (must_drop_cap_sys_boot(handler->conf)) {
#if HAVE_SYS_CAPABILITY_H
DEBUG(“Dropping cap_sys_boot”);
#else
DEBUG(“Can’t drop cap_sys_boot as capabilities aren’t supported”);
#endif
} else {
DEBUG(“Not dropping cap_sys_boot or watching utmp”);
handler->conf->need_utmp_watch = 0;
}
if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) { //effective uid
/* if the backing store is a device, mount it here and now */
if (rootfs_is_blockdev(conf)) {
if (unshare(CLONE_NEWNS) < 0) {
ERROR(“Error unsharing mounts”);
goto out_fini_nonet;
}
remount_all_slave();
if (do_rootfs_setup(conf, name, lxcpath) < 0) {
ERROR(“Error setting up rootfs mount as root before spawn”);
goto out_fini_nonet;
}
INFO(“Set up container rootfs as host root”);
}
}
err = lxc_spawn(handler);
if (err) {
ERROR(“failed to spawn ‘%s’”, name);
goto out_fini_nonet;
}
Ok 又一个spawn,进去看看
static int lxc_spawn(struct lxc_handler *handler)
{
int failed_before_rename = 0;
const char *name = handler->name;
bool cgroups_connected = false;
int saved_ns_fd[LXC_NS_MAX]; //LXC_NS_MAX
int preserve_mask = 0, i;
int netpipepair[2], nveths; //网络相关
netpipe = -1;
for (i = 0; i < LXC_NS_MAX; i++)
if (handler->conf->inherit_ns_fd[i] != -1) //暂时忽略
preserve_mask |= ns_info[i].clone_flag;
if (lxc_sync_init(handler)) //同步socketpair
return -1;
handler->clone_flags = CLONE_NEWPID|CLONE_NEWNS;
if (!lxc_list_empty(&handler->conf->id_map)) {//id_map空,初始NEWUSER
INFO(“Cloning a new user namespace”);
handler->clone_flags |= CLONE_NEWUSER;
}
//这里开始创建NEWNET了
if (handler->conf->inherit_ns_fd[LXC_NS_NET] == -1) {
if (!lxc_requests_empty_network(handler))
handler->clone_flags |= CLONE_NEWNET;
if (!lxc_list_empty(&handler->conf->network)) {
/* Find gateway addresses from the link device, which is
* no longer accessible inside the container. Do this
* before creating network interfaces, since goto
* out_delete_net does not work before lxc_clone. */
if (lxc_find_gateway_addresses(handler)) {
ERROR(“failed to find gateway addresses”);
lxc_sync_fini(handler);
return -1;
}
/* that should be done before the clone because we will
* fill the netdev index and use them in the child
*/
if (lxc_create_network(handler)) {
ERROR(“failed to create the network”);
lxc_sync_fini(handler);
return -1;
}
}
if (save_phys_nics(handler->conf)) { //save phys nics
ERROR(“failed to save physical nic info”);
goto out_abort;
}
} else {
INFO(“Inheriting a net namespace”);
}
if (handler->conf->inherit_ns_fd[LXC_NS_IPC] == -1) { //NS_IPC
handler->clone_flags |= CLONE_NEWIPC;
} else {
INFO(“Inheriting an IPC namespace”);
}
if (handler->conf->inherit_ns_fd[LXC_NS_UTS] == -1) { //NS_UTS
handler->clone_flags |= CLONE_NEWUTS;
} else {
INFO(“Inheriting a UTS namespace”);
}
if (!cgroup_init(handler)) { //init cgroup
ERROR(“failed initializing cgroup support”);
goto out_delete_net;
}
//这里ops一直为空,搞了半天不知道是怎么初始化ops的
//__attribute__((constructor))很大可能是这个
cgroups_connected = true;
if (!cgroup_create(handler)) {
ERROR(“failed creating cgroups”);
goto out_delete_net;
}
/*
* if the rootfs is not a blockdev, prevent the container from
* marking it readonly.
*
* if the container is unprivileged then skip rootfs pinning
/
if (lxc_list_empty(&handler->conf->id_map)) { //刚才是空?
handler->pinfd = pin_rootfs(handler->conf->rootfs.path);
if (handler->pinfd == -1)
INFO(“failed to pin the container’s rootfs”);
}
if (preserve_ns(saved_ns_fd, preserve_mask) < 0) //打开/prco/self/ns下面的东西
goto out_delete_net;
if (attach_ns(handler->conf->inherit_ns_fd) < 0) //
goto out_delete_net;
//下面是创建网络的pipe?
if (am_unpriv() && (nveths = count_veths(&handler->conf->network))) {
if (pipe(netpipepair) < 0) {
SYSERROR(“Error creating pipe”);
goto out_delete_net;
}
/ store netpipe in the global var for do_start’s use /
netpipe = netpipepair[0];
}
/ Create a process in a new set of namespaces */
handler->pid = lxc_clone(do_start, handler, handler->clone_flags);
if (handler->pid < 0) {
SYSERROR(“failed to fork into a new namespace”);
goto out_delete_net;
my god lxc_clone 又要跳了。。。
首先看下传递的参数吧
do_start函数指针 ,handler, handler->clone_flags,一堆NS的设置
简述下lxc_clone函数里面的流程
指定一页内存大小做为子进程的栈空间,然后调用系统的clone 进行clone,回头开一章说里面的一些函数调用。
ret = clone(do_clone, stack + stack_size, flags | SIGCHLD, &clone_arg);
/*
 * Trampoline handed to clone(2): unpack the clone_arg and invoke the
 * caller-supplied callback with its payload, forwarding its return value.
 */
static int do_clone(void *arg)
{
	struct clone_arg *ca = arg;

	return ca->fn(ca->arg);
}
Do_clone里调用刚才的clone的指针do_start
Ok,到do_start中去看,
static int do_start(void *data)
{
struct lxc_handler *handler = data;
const char *lsm_label = NULL;
if (sigprocmask(SIG_SETMASK, &handler->oldmask, NULL)) {
SYSERROR(“failed to set sigprocmask”);
return -1;
}
/* This prctl must be before the synchro, so if the parent
* dies before we set the parent death signal, we will detect
* its death with the synchro right after, otherwise we have
* a window where the parent can exit before we set the pdeath
* signal leading to a unsupervized container.
*/
if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0)) { //和前面的prctl一样
SYSERROR(“failed to set pdeath signal”);
return -1;
}
lxc_sync_fini_parent(handler);
/* don’t leak the pinfd to the container */
if (handler->pinfd >= 0) {
close(handler->pinfd);
}
/* Tell the parent task it can begin to configure the
* container and wait for it to finish
/
if (lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE))
return -1;
if (read_unpriv_netifindex(&handler->conf->network) < 0)
goto out_warn_father;
/
* if we are in a new user namespace, become root there to have
* privilege over our namespace
/
if (!lxc_list_empty(&handler->conf->id_map)) { //设置gid uid。。
NOTICE(“switching to gid/uid 0 in new user namespace”);
if (setgid(0)) {
SYSERROR(“setgid”);
goto out_warn_father;
}
if (setuid(0)) {
SYSERROR(“setuid”);
goto out_warn_father;
}
if (setgroups(0, NULL)) {
SYSERROR(“setgroups”);
goto out_warn_father;
}
}
#if HAVE_SYS_CAPABILITY_H //这个跟编译时候有关,config中也有一条
if (handler->conf->need_utmp_watch) {
if (prctl(PR_CAPBSET_DROP, CAP_SYS_BOOT, 0, 0, 0)) {
SYSERROR(“failed to remove CAP_SYS_BOOT capability”);
goto out_warn_father;
}
DEBUG(“Dropped cap_sys_boot”);
}
#endif
/ Setup the container, ip, names, utsname, … /
if (lxc_setup(handler)) { //终于要配置container了
ERROR(“failed to setup the container”);
goto out_warn_father;
}
Setup的代码就不放出来了,主要的函数贴上来。
setup_utsname(lxc_conf->utsname)
setup_network(&lxc_conf->network)
run_lxc_hooks(name, “pre-mount”, lxc_conf)
setup_rootfs(lxc_conf)
if (lxc_conf->autodev) mount_autodev(lxc_conf->rootfs.mount)
setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)
run_lxc_hooks(name, “mount”, lxc_conf)
if (lxc_conf->autodev) {
run_lxc_hooks(name, “autodev”, lxc_conf)
setup_autodev(lxc_conf->rootfs.mount) }
setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)
setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)
setup_pivot_root(&lxc_conf->rootfs)
setup_pts(lxc_conf->pts)
setup_personality(lxc_conf->personality)
setup_caps(&lxc_conf->caps)
后面再好好的研究,先把步骤理清。下面几个先看注释了。
/* ask father to setup cgroups and wait for him to finish */
if (lxc_sync_barrier_parent(handler, LXC_SYNC_CGROUP))
return -1;
/* Set the label to change to when we exec(2) the container's init */
if (!strcmp(lsm_name(), “AppArmor”))
lsm_label = handler->conf->lsm_aa_profile;
else if (!strcmp(lsm_name(), “SELinux”))
lsm_label = handler->conf->lsm_se_context;
if (lsm_process_label_set(lsm_label, 1, 1) < 0)
goto out_warn_father;
/ Some init’s such as busybox will set sane tty settings on stdin,
* stdout, stderr which it thinks is the console. We already set them
* the way we wanted on the real terminal, and we want init to do its
* setup on its console ie. the pty allocated in lxc_console_create()
* so make sure that that pty is stdin,stdout,stderr.
*/
if (lxc_console_set_stdfds(handler) < 0)
goto out_warn_father;
/* If we mounted a temporary proc, then unmount it now */
tmp_proc_unmount(handler->conf);
if (lxc_seccomp_load(handler->conf) != 0)
goto out_warn_father;
if (run_lxc_hooks(handler->name, “start”, handler->conf, handler->lxcpath, NULL)) {
ERROR(“failed to run start hooks for container ‘%s’.”, handler->name);
goto out_warn_father;
}
/* The clearenv() and putenv() calls have been moved here
* to allow us to use environment variables passed to the various
* hooks, such as the start hook above. Not all of the
* variables like CONFIG_PATH or ROOTFS are valid in this
* context but others are. /
if (clearenv()) {
SYSERROR(“failed to clear environment”);
/ don’t error out though */
}
if (putenv(“container=lxc”)) {
SYSERROR(“failed to set environment variable”);
goto out_warn_father;
}
close(handler->sigfd);
/* after this call, we are in error because this
* ops should not return as it execs /
handler->ops->start(handler, handler->data); //看怎么跳回去
handler在lxc_start 跳到__lxc_start 的时候就给ops的start 赋值函数指针start了,因此直接跳到start函数中。
/*
 * start - the start_ops->start callback run in the container's init
 * process: replace the process image with the container's init program
 * (argv[0], e.g. /sbin/init). On success execvp() does not return; a
 * return of 0 therefore only happens after an exec failure.
 */
static int start(struct lxc_handler *handler, void *data)
{
	struct start_args *arg = data;

	NOTICE("exec'ing '%s'", arg->argv[0]);

	execvp(arg->argv[0], arg->argv);
	SYSERROR("failed to exec %s", arg->argv[0]);
	return 0;
}
这里面开始执行容器的rootfs下面的第一个启动选项,default_args是/sbin/init,可以在start的时候指定。
Ok 从clone中回到 lxc_spawn这个中看后面怎么执行的。
if (attach_ns(saved_ns_fd))
WARN(“failed to restore saved namespaces”);
lxc_sync_fini_child(handler);
//一些cgroup的配置,将对用的namespace写入cgroup中
if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
failed_before_rename = 1;
if (!cgroup_create_legacy(handler)) {
ERROR(“failed to setup the legacy cgroups for %s”, name);
goto out_delete_net;
}
if (!cgroup_setup_limits(handler, false)) {
ERROR(“failed to setup the cgroup limits for ‘%s’”, name);
goto out_delete_net;
}
if (!cgroup_enter(handler))
goto out_delete_net;
if (!cgroup_chown(handler))
goto out_delete_net;
if (failed_before_rename)
goto out_delete_net;
//网络配置
/* Create the network configuration */
if (handler->clone_flags & CLONE_NEWNET) {
if (lxc_assign_network(&handler->conf->network, handler->pid)) {
ERROR(“failed to create the configured network”);
goto out_delete_net;
}
}
if (netpipe != -1) {
struct lxc_list iterator;
struct lxc_netdev *netdev;
close(netpipe);
lxc_list_for_each(iterator, &handler->conf->network) {
netdev = iterator->elem;
if (netdev->type != LXC_NET_VETH)
continue;
if (write(netpipepair[1], netdev->name, IFNAMSIZ) != IFNAMSIZ) {
ERROR(“Error writing veth name to container”);
goto out_delete_net;
}
}
close(netpipepair[1]);
}
/ map the container uids - the container became an invalid
* userid the moment it was cloned with CLONE_NEWUSER - this
* call doesn’t change anything immediately, but allows the
* container to setuid(0) (0 being mapped to something else on
* the host) later to become a valid uid again */
if (lxc_map_ids(&handler->conf->id_map, handler->pid)) {
ERROR(“failed to set up id mapping”);
goto out_delete_net;
}
/* Tell the child to continue its initialization. we’ll get
* LXC_SYNC_CGROUP when it is ready for us to setup cgroups
*/
if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE))
goto out_delete_net;
if (!cgroup_setup_limits(handler, true)) {
ERROR(“failed to setup the devices cgroup for ‘%s’”, name);
goto out_delete_net;
}
cgroup_disconnect();
cgroups_connected = false;
/* Tell the child to complete its initialization and wait for
* it to exec or return an error. (the child will never
* return LXC_SYNC_POST_CGROUP+1. It will either close the
* sync pipe, causing lxc_sync_barrier_child to return
* success, or return a different value, causing us to error
* out).
*/
if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CGROUP))
return -1;
if (detect_shared_rootfs())
umount2(handler->conf->rootfs.mount, MNT_DETACH);
if (handler->ops->post_start(handler, handler->data)) //post_start什么也没干,就赋值,直接return了
goto out_abort;
if (lxc_set_state(name, handler, RUNNING)) {
ERROR(“failed to set state to %s”,
lxc_state2str(RUNNING));
goto out_abort;
}
lxc_sync_fini(handler);
return 0;
现在spaw函数结束,很多细节都没看,里面每个地方估计都要看很久,先把握下流程吧。
netnsfd = get_netns_fd(handler->pid);
err = lxc_poll(name, handler);
if (err) {
ERROR(“mainloop exited with an error”);
if (netnsfd >= 0)
close(netnsfd);
goto out_abort;
}
又来了一个poll函数
看看poll做了什么
调用epoll_create用于通信
lxc_mainloop_open(&descr)
注册各种epoll事件
lxc_mainloop_add_handler(&descr, sigfd, signal_handler, &pid)
lxc_console_mainloop_add(&descr, handler)
lxc_command_mainloop_add(name, &descr, handler)
if (handler->conf->need_utmp_watch) lxc_utmp_mainloop_add(&descr, handler)
最后返回的时候又调用了lxc_mainloop(&descr)
while (waitpid(handler->pid, &status, 0) < 0 && errno == EINTR)
continue;
/*
* If the child process exited but was not signaled,
* it didn’t call reboot. This should mean it was an
* lxc-execute which simply exited. In any case, treat
* it as a ‘halt’
/
if (WIFSIGNALED(status)) {
switch(WTERMSIG(status)) {
case SIGINT: / halt /
DEBUG(“Container halting”);
break;
case SIGHUP: / reboot /
DEBUG(“Container rebooting”);
handler->conf->reboot = 1;
break;
case SIGSYS: / seccomp */
DEBUG(“Container violated its seccomp policy”);
break;
default:
DEBUG(“unknown exit status for init: %d”, WTERMSIG(status));
break;
}
}
lxc_rename_phys_nics_on_shutdown(netnsfd, handler->conf);
if (netnsfd >= 0)
close(netnsfd);
if (handler->pinfd >= 0) {
close(handler->pinfd);
handler->pinfd = -1;
}
lxc_monitor_send_exit_code(name, status, handler->lxcpath);
err = lxc_error_set_and_log(handler->pid, status);
}
1、首先就是第一个lxc_check_inherited函数
dir = opendir("/proc/self/fd");
if (!dir) {
WARN(“failed to opendirectory: %m”);
return -1;
}
此函数是根据配置将/proc/self/fd下,关闭fd。
然后就跳到__lxc_start中
2、看下lxc-init
在init中 设置一些关于LXC_XXX的环境变量,猜测用于后面的使用。
可以再lxc启动的时候加一些脚本。
会在hook中先执行pre-start的前缀的脚本
if (run_lxc_hooks(name, “pre-start”, conf,handler->lxcpath, NULL)) {
ERROR(“failed to runpre-start hooks for container ‘%s’.”, name);
goto out_aborting;
}
继续,后面有调用lxc_create_tty,细致研究发现,这个函数是根据conf中设置tty的个数,通过opentty函数来创建pts给容器使用。
ret = openpty(&pty_info->master, &pty_info->slave,pty_info->name,NULL, NULL);
这个可以再config文件中设置tty的个数
tty的作用是,如果容器配置了根文件系统和inittab文件设置启动gettty,同时在inittab中gettty的个数不能超过设置的tty的个数,否则会出问题
同理 lxc_console_create 也是一样
如果容器配置了根文件系统和inittab文件设置使用控制台,您可能希望指定该控制台的输出。可以在config中设置lxc.console.logfile来指定输出的位置,lxc.console指定console的个数
然后通过ttys_shift_ids来设置tty的owner。
这样init的初始化过程就结束了。
3、然后到must_drop_cap_sys_boot(handler->conf)这个步骤中。
这个函数会读系统中/proc/sys/kernel/ctrl-alt-del这个文件,判断确定cmd的命令,cmd = v ?LINUX_REBOOT_CMD_CAD_ON : LINUX_REBOOT_CMD_CAD_OFF;
然后会系统调用clone,其中函数指针为container_reboot_supported,最终会调用reboot这个函数,
通过man reboot可以看到细节
LINUX_REBOOT_CMD_CAD_OFF
(RB_DISABLE_CAD, 0). CAD is disabled. This means that the CAD keystroke will cause a SIGINT signalto be sent to init
(process 1),whereupon this process may decide upon a proper action (maybe: kill allprocesses, sync, reboot).
LINUX_REBOOT_CMD_CAD_ON
(RB_ENABLE_CAD,0x89abcdef). CAD is enabled. This means that the CAD keystroke willimmediately cause the action associated
withLINUX_REBOOT_CMD_RESTART.
那么,问题来了,到底reboot什么东西,系统?还是container?一个已经启动,一个正在start过程。
暂时还没搞懂,是不是NEWPID | NEWUSER 启动的新的namespace的空间中的东西,可能发SIGINT信号给主机的init的进程。将以前启动的container剩余的部分重新启动?先mark一下。 |
4、然后判断if (geteuid() == 0&& !lxc_list_empty(&conf->id_map)),id_map是空的,因为目前所有的的流程,都是以privilegecontainer说的,所有非root的用户就不分析了。
检查rootfs_is_blockdev(conf) 感觉函数是在判断rootfs的路径是否为blockdev,然后remount_all_slave打开/proc/self/mountinfo然后将shared enties 改变到slave中,就看当前的系统有没有share entries了。
然后调用do_rootfs_setup(conf, name,lxcpath) 将container rootfs 挂载上去。同时也通过pre-mount的脚本将自定义的一些mount 加进去,因此,这个地方也可以自己自定义,复用一些东西
然后调用setup_rootfs,先是调用mount(“”,”/”, NULL, MS_SLAVE | MS_REC, 0),mount /,调用bdev_init,初始化rootfs。 |
5、然后进去lxc-spawn这个函数中,在别的地方很多次见到spawn这个函数,只知道spawn的英文意思是产卵的意思。这个函数上次分析,里面有很多事在做。
首先将以前的cloneflag 保存,记得start的刚开始初始化的时候如果没设置,ns_info中都设置默认的-1,然后就是同步handler,没什么好说的。
然后就是讲handler的clone_flags设置CLONE_NEWXXX,获取物理网络,等等设置一堆东西, 然后就要想办法将cgroup与namespace联系到一块了,到cgroup_init里面看看是什么流程。
首先,前面一直迷惑的ops怎么被初始化的问题,
__attribute__((constructor))
void cgroup_ops_init(void)
这个结构,在函数未调用之前就被执行了,这个回头会在杂篇中讲到,首先程序会根据系统中是否有cgmanager 来使用不同的初始化函数,本文就默认没有cgmanager,调用通用的cgfs_ops_init;返回一个引用值,返回静态变量cgfs_ops;将一些指针赋值,ok,看cgroup_init初始化过程,init指向cgfs_init,因此到cgfs_init这个函数中看一下
首先初始化cgfs_data的数据结构,然后设置cgroup_pattern为全局变量中lxc.cgroup.pattern即在编译中的DEFAULT_CGROUP_PATTERN,默认的是/lxc/%n,这个暂时不知道含义。继续看
然后调用lxc_cgroup_load_meta加载metadata,函数中会判断cgroup的使用情况,然后会调用lxc_cgroup_load_meta2的函数,会查找子系统的白名单,或者指定的hierarchies。
最终返回给handler->cgroup_data。
然后调用cgroup_create(handler)来创建cgroup,调用ops的create,create的指针指向cgfs_create,是个内联函数,最终调用lxc_cgroupfs_create,lxc_cgroupfs_create(d->name,d->cgroup_pattern, md, NULL)用来创建new cgroup
/* we will modify the result of this operation directly,
 * so we don't have to copy the data structure
 */
base_info = (path_pattern[0] == '/') ?
    lxc_cgroup_process_info_get_init(meta_data) : // pattern 为 /lxc/%n
    lxc_cgroup_process_info_get_self(meta_data);
if (!base_info)
    return NULL;
其中get_init为 return lxc_cgroup_process_info_get(1, meta); 即以pid为1号进程get数据,根据/proc/1/cgroup中的信息添加到cgroup_process_info的链表中。
new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
if (!new_cgroup_paths)
    goto out_initial_error;
new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
if (!new_cgroup_paths_sub)
    goto out_initial_error;
分配空间
/* find mount points we can use */
for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
    h = info_ptr->hierarchy;
    mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
    if (!mp) {
        ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
        goto out_initial_error;
    }
    info_ptr->designated_mount_point = mp;
    if (lxc_string_in_array("ns", (const char **)h->subsystems))
        continue;
    if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
        ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
        goto out_initial_error;
    }
}
/* normalize the path */
cgroup_path_components = lxc_normalize_path(path_pattern);
if (!cgroup_path_components)
    goto out_initial_error;
来看主要的find_name_on_this_level程序块
/* determine name of the path component we should create */
if (contains_name && suffix > 0) {
    char *buf = calloc(strlen(name) + 32, 1);
    if (!buf)
        goto out_initial_error;
    snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
    current_component = lxc_string_replace("%n", buf, p_eff);
    free(buf);
} else {
    current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
}
parts[0] = path_so_far;
parts[1] = current_component;
parts[2] = NULL;
current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
/* Now go through each hierarchy and try to create the
* corresponding cgroup
*/
其中最主要的是
r = create_cgroup(info_ptr->designated_mount_point, current_entire_path); 来创建cgroup的目录层级。
理一下头绪,cgroup通过cgroup.pattern 的模式,然后读取/proc/1/cgroup下去创建相应的cgroup层级,最后创建cgroup的目录。
6、回到lxc-spawn中,然后到通过一些网络的netpipepair设置,这些都不是我们关心的。
最后调用lxc_clone函数调用do_start来对container进行一系列的初始化操作,首先是lxc_setup 前面也介绍了,通过初始化,mount rootfs,网络,autodev,自动挂载/proc,/sys等文件,然后设置tty,console等设置标准输入输出的位置,等等。
然后可以设置if (run_lxc_hooks(handler->name, "start", handler->conf, handler->lxcpath, NULL)) start脚本来辅助工作,这个也是可以自定义的内容
最后在do_start函数中调用handler->ops->start(handler,handler->data);
ops为lxc的operation中的内容,来看看想干嘛。execvp(arg->argv[0],arg->argv);执行start container了,这里面,我们用到的是/init不是默认的/sbin/init,因为我们的容器不是标准的容器,所以这点是不同的。
里面注释也谈到了,当我们执行这个/init的时候,函数就不会返回来了,那么后面的程序怎么办?
所以在do_start中子进程一直等到父进程完成工作和配置。
/* Tell the parent task it can begin to configure the
 * container and wait for it to finish
 */
if (lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE))
    return -1;
然后父进程进行一系列的配置,其中最主要的就是cgroup的配置,如果容器没有cgroup的话,资源划分就成问题了,
cgroup_setup_limits 资源限制,cgroup_enter将pid进程加入task任务中,等等设置cgroup
然后还是配置网络,将container加入到veth当中,这当年还是要看自己config网络相关的配置,so,网络配置有很多,就忽略网络的问题了。
然后又告诉子进程继续初始化过程
/* Tell the child to continue its initialization. we'll get
 * LXC_SYNC_CGROUP when it is ready for us to setup cgroups
 */
if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE))
    goto out_delete_net;
然后当子进程setup过程完成之后,让父进程设置cgroup,同时父进程设置完cgroup时,也通知子进程完成,此时子进程就真正进入到container的init的进程了。
一直没发现这个LXC_SYNC_POST_CGROUP wait 子进程的信号谁发给他,这个比较疑惑?
最后发现是do_start这个函数if判断失败后goto的,则表示中间会error,最后还有个post_cgroup,注释是这样说道。
/* Tell the child to complete its initialization and wait for
 * it to exec or return an error. (the child will never
 * return LXC_SYNC_POST_CGROUP+1. It will either close the
 * sync pipe, causing lxc_sync_barrier_child to return
 * success, or return a different value, causing us to error
 * out).
 */
if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CGROUP))
    return -1;
然后就是调用post-start,NOTICE 运行的pid,最后设置container的状态为RUNNING,至此spawn就结束了。
回到__lxc_start中,get_netns_fd获得network的状态,然后进入lxc_poll中.后面没什么好说的,现在主要考虑lxc 在exec container的init的进程过后,lxc是如何继续接管程序的。
lxc start部分的源码的大致工作流程已经熟悉,那么就要关注他的核心内容了,就是关于namespace 和 cgroup的内容了。
根据前面的分析已经知道,lxc根据一些配置会自动将flag设置成CLONE_NEWXXX,然后会通过cgroup init 来初始化一堆 cgroup。我们先来看一下。
首先通过cgroup_create 来创建 cgroup,前面介绍都是有个ops 指向函数指针,这里先假设我们用的cgfs,理论上应该和cgroupmanager是一样的方式,可能细节有区别而已。
那么顺理成章create指向cgfs_create,后面就直接说函数指针的位置了。
函数内部通过调用lxc_cgroupfs_create。那么就要从create a newcgroup
static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
char **cgroup_path_components = NULL;
char **p = NULL;
char *path_so_far = NULL;
char **new_cgroup_paths = NULL;
char **new_cgroup_paths_sub = NULL;
struct cgroup_mount_point *mp;
struct cgroup_hierarchy *h;
struct cgroup_process_info *base_info = NULL;
struct cgroup_process_info *info_ptr;
int saved_errno;
int r;
unsigned suffix = 0;
bool had_sub_pattern = false;
size_t i;
if (!is_valid_cgroup(name)) { // 判断 name 是否有效
    ERROR("Invalid cgroup name: '%s'", name);
    errno = EINVAL;
    return NULL;
}
if (!strstr(path_pattern, "%n")) {
    ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
    errno = EINVAL;
    return NULL;
}
根据privilege 和unprivilege container的不同读取到proc 下面的pid的不同来确定不同的cgroup 信息。
base_info = (path_pattern[0] == '/') ?
    lxc_cgroup_process_info_get_init(meta_data) :
    lxc_cgroup_process_info_get_self(meta_data);
if (!base_info)
    return NULL;
new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
if (!new_cgroup_paths)
    goto out_initial_error;
在自己机子上面,看到的cgroup:
gudh@lxc-D3F2-CM:~$ cat/proc/self/cgroup
11:name=systemd:/user/1004.user/5.session
10:hugetlb:/user/1004.user/5.session
9:perf_event:/user/1004.user/5.session
8:blkio:/user/1004.user/5.session
7:freezer:/user/1004.user/5.session
6:devices:/user/1004.user/5.session
5:memory:/user/1004.user/5.session
4:cpuacct:/user/1004.user/5.session
3:cpu:/user/1004.user/5.session
2:cpuset:/user/1004.user/5.session
gudh@lxc-D3F2-CM:~$ id
uid=1004(gudh)gid=1004(gudh) groups=1004(gudh),0(root),4(adm)
gudh@lxc-D3F2-CM:~$ cat/proc/1/cgroup
11:name=systemd:/
10:hugetlb:/
9:perf_event:/
8:blkio:/
7:freezer:/
6:devices:/
5:memory:/
4:cpuacct:/
3:cpu:/
2:cpuset:/
然后就是分配path的大小
new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
if (!new_cgroup_paths_sub)
    goto out_initial_error;
查找可以挂载的点,然后创建。
/* find mount points we can use */
for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
    h = info_ptr->hierarchy;
    mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
    if (!mp) {
        ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
        goto out_initial_error;
    }
    info_ptr->designated_mount_point = mp;
    if (lxc_string_in_array("ns", (const char **)h->subsystems))
        continue;
    if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
        ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
        goto out_initial_error;
    }
}
cgroup_path_components = lxc_normalize_path(path_pattern);
if (!cgroup_path_components)
goto out_initial_error;
然后根据normalize的path去创建他们。
/* go through the path components to see if we can create them */
for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
    /* we only want to create the same component with -1, -2, etc.
     * if the component contains the container name itself, otherwise
     * it's not an error if it already exists
     */
    char *p_eff = *p ? *p : (char *)sub_pattern;
    bool contains_name = strstr(p_eff, "%n");
    char *current_component = NULL;
    char *current_subpath = NULL;
    char *current_entire_path = NULL;
    char *parts[3];
    size_t j = 0;
    i = 0;
    /* if we are processing the sub pattern, we want to make sure
     * loop is ended the next time around
     */
    if (!*p) {
        had_sub_pattern = true;
        p--;
    }
然后就到find_name_on_this_level,这里面pattern 应该是/lxc/%n
goto find_name_on_this_level;
find_name_on_this_level:
    /* determine name of the path component we should create */
    if (contains_name && suffix > 0) {
        char *buf = calloc(strlen(name) + 32, 1);
        if (!buf)
            goto out_initial_error;
        snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
        current_component = lxc_string_replace("%n", buf, p_eff);
        free(buf);
    } else {
        current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
    }
    parts[0] = path_so_far;
    parts[1] = current_component;
    parts[2] = NULL;
    current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
紧接着创建相应的cgroup
for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
    char *parts2[3];
    if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
        continue;
    current_entire_path = NULL;
    parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
    parts2[1] = current_subpath;
    parts2[2] = NULL;
    current_entire_path = lxc_string_join("/", (const char **)parts2, false);
    if (!*p) {
        /* we are processing the subpath, so only update that one */
        free(new_cgroup_paths_sub[i]);
        new_cgroup_paths_sub[i] = strdup(current_entire_path);
        if (!new_cgroup_paths_sub[i])
            goto cleanup_from_error;
    } else {
        /* remember which path was used on this controller */
        free(new_cgroup_paths[i]);
        new_cgroup_paths[i] = strdup(current_entire_path);
        if (!new_cgroup_paths[i])
            goto cleanup_from_error;
    }
    r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
这样就完成相应的代码设置。
对于pattern 为/lxc/%n 就分两次不同创建在相应的目录,这样cgroup subpath 也同时受到顶层/lxc 的控制,cgroup就成功创建了。
然后就到cgroup_create_legacy最终调用lxc_cgroup_create_legacy
直接看注释
/*
 * if cgroup is mounted at /cgroup and task is in cgroup /ab/, pid 2375 and
 * name is c1,
 * dir: /ab
 * fulloldpath = /cgroup/ab/2375
 * fullnewpath = /cgroup/ab/c1
 * newname = /ab/c1
 */
如果老名字为/sys/cgroup/cpu/lxc/android/2375
那么就改成/sys/cgroup/cpu/lxc/android/android?
加入cgroup一些创建file的 capability
cgroup_setup_limits 名字很明显设置限额 with_device是false
将在config中加入的device.allow 和device.deny 配置
手动设置的地方
然后就是cgfs_enter 最后到lxc_cgroupfs_enter
lxc_cgroup_find_mount_point 查找path下面的mount point
cgroup_to_absolute_path absolute path
lxc_write_to_file然后将pid写入到cgroup的absolutepath下面
这样就将pid 与cgroup成功绑定。
cgroup_chown chown的指针目前是NULL 暂时不分析
后面又来了一次 cgroup_setup_limits 这是with_device 是true
此时应该就完成了cgroup的相关设置