lxc-start 源码分析

https://github.com/lxc/lxc
lxc_start.c
int main(int argc, char *argv[])
{
int err = 1;
struct lxc_conf *conf; //初始化config结构
char *const *args; //传递的参数
char *rcfile = NULL; //指定配置文件
char *const default_args[] = { //默认的args参数
"/sbin/init",
NULL,
};

struct lxc_container *c; //lxc-container 的结构体
….
}

lxc_conf这个数据结构
struct lxc_conf {
int is_execute; //容器是否在执行
char *fstab; //fstab?
int tty; //tty的个数
int pts; //pts的个数?
int reboot; //重启?
int need_utmp_watch; //字面翻译 需要utmp 查看
signed long personality; //字面翻译 特点
struct utsname *utsname; //ustname
struct lxc_list cgroup; //cgroup list lxc_list只是简单的链表结构
struct lxc_list id_map; //id_map list
struct lxc_list network; //network list
struct saved_nic *saved_nics;//saved_nics 结构
int num_savednics; //savednics数量?
int auto_mounts; //auto_mounts?
struct lxc_list mount_list; //mount_list list?
struct lxc_list caps; //caps list?
struct lxc_list keepcaps; //keepcaps list?
struct lxc_tty_info tty_info; //tty的相关信息
struct lxc_console console; //console的结构体
struct lxc_rootfs rootfs; //rootfs的结构体
char *ttydir; //tty目录
int close_all_fds; //关闭所有fd
struct lxc_list hooks[NUM_LXC_HOOKS]; //hooks 函数



char *lsm_aa_profile;   //?
char *lsm_se_context; //?
int tmp_umount_proc; //?
char *seccomp; // filename with the seccomp rules #if HAVE_SCMP_FILTER_CTX
scmp_filter_ctx seccomp_ctx; #endif
int maincmd_fd; //?
int autodev; // if 1, mount and fill a /dev at start
int haltsignal; // signal used to halt container
int stopsignal; // signal used to hard stop container
int kmsg; // if 1, create /dev/kmsg symlink
char *rcfile; // Copy of the top level rcfile we read

// Logfile and loglevel can be set in a container config file.
// Those function as defaults. The defaults can be overriden
// by command line. However we don't want the command line
// specified values to be saved on c->save_config(). So we
// store the config file specified values here.
char *logfile; // the logfile as specifed in config
int loglevel; // loglevel as specifed in config (if any)

int inherit_ns_fd[LXC_NS_MAX];

int start_auto;
int start_delay;
int start_order;
struct lxc_list groups;
int nbd_idx;

/* set to true when rootfs has been setup */
bool rootfs_setup; };


lxc_container的结构体



/*!




  • An LXC container.
    */
    struct lxc_container {
    // private fields
    char *name; //container 的名字
    char *configfile; // configuration file 的路径
    char *pidfile; // 存储pid 的文件名
    struct lxc_lock *slock; //Container semaphore lock. 容器的信号锁
    struct lxc_lock *privlock;//容器的私有信号锁
    int numthreads; //容器的引用数量,由privlock保护
    struct lxc_conf *lxc_conf;



    // public fields
    char *error_string; //全局变量 可读的最后显示的error
    int error_num; //最后error的数字
    bool daemonize; //容器是否希望开启守护进程
    char *config_path; // configuration file 的路径 和上面的区别? 全局?
    ……. //一堆成员函数 暂不看
    }





lxc_list_init(&defines);             //初始化list



defines定义在文件开始,为全局变量



static struct lxc_list defines;



if(lxc_caps_init())                    //caps初始化



        return err;



到这个函数里看一下。



/*
 * lxc_caps_init - adjust process credentials at startup.
 *
 * If invoked as real root there is nothing to do.  If running as a
 * setuid-root binary (real uid != 0, effective uid == 0), drop back to
 * the invoking user's real uid/gid while retaining capabilities via
 * PR_SET_KEEPCAPS, then re-raise the effective set with lxc_caps_up().
 *
 * Returns 0 on success, -1 on error.
 */
int lxc_caps_init(void)
{
	uid_t uid = getuid();
	gid_t gid = getgid();
	uid_t euid = geteuid(); /* effective uid */

	/* running as plain root: no credential juggling needed */
	if (!uid) {
		INFO("command is run as 'root'");
		return 0;
	}

	if (uid && !euid) {
		INFO("command is run as setuid root (uid : %d)", uid);

		/* keep capabilities across the setresgid/setresuid below */
		if (prctl(PR_SET_KEEPCAPS, 1)) {
			ERROR("failed to 'PR_SET_KEEPCAPS': %m");
			return -1;
		}

		if (setresgid(gid, gid, gid)) {
			ERROR("failed to change gid to '%d': %m", gid);
			return -1;
		}

		if (setresuid(uid, uid, uid)) {
			ERROR("failed to change uid to '%d': %m", uid);
			return -1;
		}

		/* re-raise the effective capability set */
		if (lxc_caps_up()) {
			ERROR("failed to restore capabilities: %m");
			return -1;
		}
	}

	if (uid == euid)
		INFO("command is run as user '%d'", uid);

	return 0;
}



接着就是读传过来的参数



if(lxc_arguments_parse(&my_args, argc, argv))



        return err;



这个函数就没细看,只需知道将参数传给my_args



判断有没有指定 初始执行的参数,没有的话指定默认参数



if (!my_args.argc)



        args = default_args;



    else       



        args = my_args.argv;



 



初始化一堆log的,暂时也没细看



if (lxc_log_init(my_args.name, my_args.log_file, my_args.log_priority,
my_args.progname, my_args.quiet, my_args.lxcpath[0]))
return err;
lxc_log_options_no_override();



const char *lxcpath = my_args.lxcpath[0]; //lxcpath 很有意思
// lxc_global_config_value(“lxc.lxcpath”)这个写的还是比较复杂的,总之lxcpath会是默认的路径
//指定config的位置,如果没指定,则使用默认的路径的config,通过配置创建新的
/*
 * rcfile possibilities:
 * 1. rcfile from random path specified in cli option
 * 2. rcfile not specified, use $lxcpath/$lxcname/config
 * 3. rcfile not specified and does not exist.
 */

/* rcfile is specified in the cli option */
if (my_args.rcfile) {
rcfile = (char *)my_args.rcfile;
c = lxc_container_new(my_args.name, lxcpath);
if (!c) {
ERROR("Failed to create lxc_container");
return err;
}
c->clear_config(c);
if (!c->load_config(c, rcfile)) {
ERROR("Failed to load rcfile");
lxc_container_put(c);
return err;
}
} else {
int rc;



    rc = asprintf(&rcfile, "%s/%s/config", lxcpath, my_args.name);
if (rc == -1) {
SYSERROR("failed to allocate memory");
return err;
}
INFO("using rcfile %s", rcfile);

/* container configuration does not exist */
if (access(rcfile, F_OK)) {
free(rcfile);
rcfile = NULL;
}
c = lxc_container_new(my_args.name, lxcpath);
if (!c) {
ERROR("Failed to create lxc_container");
return err;
} }


里面最主要的函数c = lxc_container_new(my_args.name, lxcpath);



struct lxc_container *lxc_container_new(const char *name, const char *configpath)
{
struct lxc_container *c; //结构体lxc_container 前面分析过了



c = malloc(sizeof(*c));			//创建
if (!c) {
fprintf(stderr, "failed to malloc lxc_container\n");
return NULL;
}
memset(c, 0, sizeof(*c)); //初始0

if (configpath)
c->config_path = strdup(configpath); //config_path
else
c->config_path = strdup(lxc_global_config_value("lxc.lxcpath"));

if (!c->config_path) {
fprintf(stderr, "Out of memory\n");
goto err;
}

remove_trailing_slashes(c->config_path);
c->name = malloc(strlen(name)+1);
if (!c->name) {
fprintf(stderr, "Error allocating lxc_container name\n");
goto err;
}
strcpy(c->name, name);

c->numthreads = 1;
// lock这部分没细看
if (!(c->slock = lxc_newlock(c->config_path, name))) {
fprintf(stderr, "failed to create lock\n");
goto err;
}
if (!(c->privlock = lxc_newlock(NULL, NULL))) {
fprintf(stderr, "failed to alloc privlock\n");
goto err;
}
// set config path
if (!set_config_filename(c)) {
fprintf(stderr, "Error allocating config file pathname\n");
goto err;
}
//load config path
if (file_exists(c->configfile) && !lxcapi_load_config(c, NULL))
goto err;
//判断容器是否创建失败
if (ongoing_create(c) == 2) {
ERROR("Error: %s creation was not completed", c->name);
lxcapi_destroy(c);
lxcapi_clear_config(c);
}
c->daemonize = true;
c->pidfile = NULL;
…… //后面都是成员函数赋值
}

现在回到lxc_start 的main函数中

//判断容器是否在运行
if (c->is_running(c)) {
	ERROR("Container is already running.");
	err = 0;
	goto out;
}

/*
 * We should use set_config_item() over &defines, which would handle
 * unset c->lxc_conf for us and let us not use lxc_config_define_load()
 */
//加载config文件
if (!c->lxc_conf)
	c->lxc_conf = lxc_conf_init();
conf = c->lxc_conf;

if (lxc_config_define_load(&defines, conf))
	goto out;

//提示信息
if (!rcfile && !strcmp("/sbin/init", args[0])) {
ERROR("Executing '/sbin/init' with no configuration file may crash the host");
goto out;
}

if (ensure_path(&conf->console.path, my_args.console) < 0) {
ERROR("failed to ensure console path '%s'", my_args.console);
goto out;
}

if (ensure_path(&conf->console.log_path, my_args.console_log) < 0) {
ERROR("failed to ensure console log '%s'", my_args.console_log);
goto out;
}
// pid 文件
if (my_args.pidfile != NULL) {
if (ensure_path(&c->pidfile, my_args.pidfile) < 0) {
ERROR("failed to ensure pidfile '%s'", my_args.pidfile);
goto out;
}
}
//一些share_ns 的配置,未细看
int i;
for (i = 0; i < LXC_NS_MAX; i++) {
if (my_args.share_ns[i] == NULL)
continue;

int pid = pid_from_lxcname(my_args.share_ns[i], lxcpath);
if (pid < 1)
goto out;

int fd = open_ns(pid, ns_info[i].proc_name);
if (fd < 0)
goto out;
conf->inherit_ns_fd[i] = fd;
}
//初始化为1
if (!my_args.daemonize) {
c->want_daemonize(c, false);
}

if (my_args.close_all_fds)
c->want_close_all_fds(c, true);

err = c->start(c, 0, args) ? 0 : 1;

if (err) {
ERROR("The container failed to start.");
if (my_args.daemonize)
ERROR("To get more details, run the container in foreground mode.");
ERROR("Additional information can be obtained by setting the "
"--logfile and --logpriority options.");
err = c->error_num;
lxc_container_put(c);
return err;
}


out:
lxc_container_put(c);
return err;
}
直接到c->start 过程start是调用 lxcapi_start 这个函数指针,现在去看下这个函数到底是怎么讲lxc container 启动起来的。



    传过来的参数是container c,useinit 0,argv=args 即指定的初始化程序



static bool lxcapi_start(struct lxc_container *c, int useinit, char * const argv[])
{
int ret;
struct lxc_conf *conf;
bool daemonize = false; //守护进程为false
FILE *pid_fp = NULL; //pid_file文件的指针
char *default_args[] = { //又是default_args
"/sbin/init",
NULL,
};



/* container exists */
if (!c) //判断容器是否存在
return false;
/* container has been setup */
if (!c->lxc_conf) //config加载完美
return false;

if ((ret = ongoing_create(c)) < 0) { //容器是否创建完整
ERROR("Error checking for incomplete creation");
return false;
}
if (ret == 2) {
ERROR("Error: %s creation was not completed", c->name);
c->destroy(c);
return false;
} else if (ret == 1) {
ERROR("Error: creation of %s is ongoing", c->name);
return false;
}

/* is this app meant to be run through lxcinit, as in lxc-execute? */
if (useinit && !argv) //还是判断
return false;

if (container_mem_lock(c)) //lock
return false;
conf = c->lxc_conf; //conf赋值
daemonize = c->daemonize; //true
container_mem_unlock(c); //unlock

if (useinit) { //0
ret = lxc_execute(c->name, argv, 1, conf, c->config_path);
return ret == 0 ? true : false;
}

if (!argv)
argv = default_args; //又重新判断 args 是否为空,空即赋值
/*
* say, I'm not sure - what locks do we want here? Any?
* Is liblxc's locking enough here to protect the on disk
* container? We don't want to exclude things like lxc_info
* while container is running...
* 这段注释给跪了,还是老老实实看他想干嘛吧
*/
if (daemonize) { //true
lxc_monitord_spawn(c->config_path); //start好像跟前面的版本差别

pid_t pid = fork();
if (pid < 0)
return false;

if (pid != 0) {
/* Set to NULL because we don't want father unlink
* the PID file, child will do the free and unlink.
*/
c->pidfile = NULL;
return wait_on_daemonized_start(c, pid); //等下进去,里面有waitpid,所以先看后面
}

/* second fork to be reparented by init */
pid = fork(); //两次fork
if (pid < 0) {
SYSERROR("Error doing dual-fork");
return false;
}
if (pid != 0)
exit(0);
/* like daemon(), chdir to / and redirect 0,1,2 to /dev/null */
if (chdir("/")) { //root目录
SYSERROR("Error chdir()ing to /.");
return false;
}
lxc_check_inherited(conf, -1);
close(0); //pipe file?
close(1);
close(2);
open("/dev/zero", O_RDONLY);
open("/dev/null", O_RDWR);
open("/dev/null", O_RDWR);
setsid();
} else {
if (!am_single_threaded()) {
ERROR("Cannot start non-daemonized container when threaded");
return false;
}
} /* We need to write PID file after daeminize, so we always
* write the right PID.
*/
if (c->pidfile) { //写入pid 到pidfile
pid_fp = fopen(c->pidfile, "w");
if (pid_fp == NULL) {
SYSERROR("Failed to create pidfile '%s' for '%s'",
c->pidfile, c->name);
return false;
}

if (fprintf(pid_fp, "%d\n", getpid()) < 0) {
SYSERROR("Failed to write '%s'", c->pidfile);
fclose(pid_fp);
pid_fp = NULL;
return false;
}

fclose(pid_fp);
pid_fp = NULL;
}


reboot:
…..
}



现在到 wait_on_daemonized_start(c, pid) 里面看看函数调用的情况



这个就是主线程的pid 在等待其他子线程工作完,然后执行,只能硬着头皮继续看了。



/*
 * wait_on_daemonized_start - parent-side wait for a daemonized start.
 *
 * Reaps the intermediate (dual-fork) child, then blocks until the
 * container reaches the RUNNING state or the timeout expires.
 * Returns true if the container reached RUNNING within the timeout.
 */
static bool wait_on_daemonized_start(struct lxc_container *c, int pid)
{
	/* we'll probably want to make this timeout configurable? */
	int timeout = 5, ret, status;

	/*
	 * our child is going to fork again, then exit. reap the
	 * child
	 */
	ret = waitpid(pid, &status, 0);
	if (ret == -1 || !WIFEXITED(status) || WEXITSTATUS(status) != 0)
		DEBUG("failed waiting for first dual-fork child");
	return lxcapi_wait(c, "RUNNING", timeout);
}

函数很简单 直接调用了lxcapi_wait。


/*
 * lxcapi_wait - block until the container reaches @state or @timeout
 * seconds elapse.  Thin wrapper over lxc_wait(); returns true on
 * success, false on a NULL container or lxc_wait() failure.
 */
static bool lxcapi_wait(struct lxc_container *c, const char *state, int timeout)
{
	if (!c)
		return false;

	return lxc_wait(c->name, state, timeout, c->config_path) == 0;
}


这个依旧很简单又跳走了。。。lxc_wait了



这个函数现在先不细说了,只是检查容器创建是否超时的问题。



reboot:
    conf->reboot = 0;
    ret = lxc_start(c->name, argv, conf, c->config_path);
    …..
}
reboot 又调用lxc-start 泪奔。
/*
 * lxc_start - launch the container's init process.
 *
 * Wraps argv in a start_args structure and delegates to __lxc_start
 * with the default start_ops.  Returns __lxc_start's result, or -1
 * if the inherited-fd check fails.
 */
int lxc_start(const char *name, char *const argv[], struct lxc_conf *conf,
          const char *lxcpath)
{
    struct start_args start_arg = { // package argv for the start ops callback
        .argv = argv,
    };   



    if (lxc_check_inherited(conf, -1)) 
        return -1;



    conf->need_utmp_watch = 1; // watch utmp so reboot/halt inside the container is detected
    return __lxc_start(name, conf, &start_ops, &start_arg, lxcpath); // common start path
}
My god 感觉好戏才刚刚开始。。。。
这里面就是lxc-start的全部,所以分开来讲,前面的废话太多,这次看重点
int __lxc_start(const char *name, struct lxc_conf *conf,
        struct lxc_operations *ops, void *data, const char *lxcpath)
{
    struct lxc_handler *handler; //结构体,保存container的一些属性
    int err = -1;
    int status;
    int netnsfd = -1;



handler = lxc_init(name, conf, lxcpath); //init
这时候要跳到init中去看看
struct lxc_handler *lxc_init(const char *name, struct lxc_conf *conf, const char *lxcpath)
{
    struct lxc_handler *handler;



    handler = malloc(sizeof(*handler)); //初始化一堆 handler
    if (!handler)
        return NULL;



    memset(handler, 0, sizeof(*handler));



    handler->conf = conf;
    handler->lxcpath = lxcpath;
    handler->pinfd = -1;



    lsm_init();



handler->name = strdup(name);
if (!handler->name) {
        ERROR(“failed to allocate memory”);
        goto out_free;
   }
 if (lxc_cmd_init(name, handler, lxcpath)) //cmd_init
        goto out_free_name;
if (lxc_read_seccomp_config(conf) != 0) { //这货直接返回0,什么都没有
       ERROR(“failed loading seccomp policy”);
     goto out_close_maincmd_fd;
}
/* Begin by setting the state to STARTING /
    if (lxc_set_state(name, handler, STARTING)) { //STARTING enum 类型
        ERROR(“failed to set state ‘%s’”, lxc_state2str(STARTING));
        goto out_close_maincmd_fd;
    }
/
Start of environment variable setup for hooks /
    if (setenv(“LXC_NAME”, name, 1)) {
        SYSERROR(“failed to set environment variable for container name”);
    }
    if (setenv(“LXC_CONFIG_FILE”, conf->rcfile, 1)) {
        SYSERROR(“failed to set environment variable for config path”);
    }
    if (setenv(“LXC_ROOTFS_MOUNT”, conf->rootfs.mount, 1)) {
        SYSERROR(“failed to set environment variable for rootfs mount”);
    }
    if (setenv(“LXC_ROOTFS_PATH”, conf->rootfs.path, 1)) {
        SYSERROR(“failed to set environment variable for rootfs mount”);
    }
    if (conf->console.path && setenv(“LXC_CONSOLE”, conf->console.path, 1)) {
        SYSERROR(“failed to set environment variable for console path”);
    }
    if (conf->console.log_path && setenv(“LXC_CONSOLE_LOGPATH”, conf->console.log_path, 1)) {
        SYSERROR(“failed to set environment variable for console log”);
}
Prestart 在这个位置,这个是可以配置到config文件中的
/
End of environment variable setup for hooks /
if (run_lxc_hooks(name, “pre-start”, conf, handler->lxcpath, NULL)) {
     ERROR(“failed to run pre-start hooks for container ‘%s’.”, name);
     goto out_aborting;
}
//创建tty 
if (lxc_create_tty(name, conf)) {
        ERROR(“failed to create the ttys”);
        goto out_aborting;
}
这个函数打开的是/dev/ptmx这个东西还不是很了解,回头细看
和pts 是主从设备,然后分配pty?
/
the signal fd has to be created before forking otherwise
     * if the child process exits before we setup the signal fd,
     * the event will be lost and the command will be stuck */
    handler->sigfd = setup_signal_fd(&handler->oldmask);
    if (handler->sigfd < 0) {
        ERROR(“failed to set sigchild fd handler”);
        goto out_delete_tty;
    }



    /* do this after setting up signals since it might unblock SIGWINCH */
    if (lxc_console_create(conf)) {
        ERROR(“failed to create console”);
        goto out_restore_sigmask;
    }



    if (ttys_shift_ids(conf) < 0) {
        ERROR(“Failed to shift tty into container”);
        goto out_restore_sigmask;
    }



    INFO(“‘%s’ is initialized”, name);
    return handler;
}
Init完成,回到__lxc_start中
if (!handler) {
        ERROR(“failed to initialize the container”);
        return -1;
    }
    handler->ops = ops;
    handler->data = data;
// lxc是否支持reboot,配置中handler->conf->need_utmp_watch=1表示支持
    if (must_drop_cap_sys_boot(handler->conf)) {
        #if HAVE_SYS_CAPABILITY_H
        DEBUG(“Dropping cap_sys_boot”);
        #else
        DEBUG(“Can’t drop cap_sys_boot as capabilities aren’t supported”);
        #endif
    } else {
        DEBUG(“Not dropping cap_sys_boot or watching utmp”);
        handler->conf->need_utmp_watch = 0;
}
if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) { //effective uid
        /* if the backing store is a device, mount it here and now */
        if (rootfs_is_blockdev(conf)) {
            if (unshare(CLONE_NEWNS) < 0) {
                ERROR(“Error unsharing mounts”);
                goto out_fini_nonet;
            }
            remount_all_slave();
            if (do_rootfs_setup(conf, name, lxcpath) < 0) {
                ERROR(“Error setting up rootfs mount as root before spawn”);
                goto out_fini_nonet;
            }
            INFO(“Set up container rootfs as host root”);
        }
    }
err = lxc_spawn(handler);
    if (err) {
        ERROR(“failed to spawn ‘%s’”, name);
        goto out_fini_nonet;
}
Ok 又一个spawn,进去看看
static int lxc_spawn(struct lxc_handler *handler)
{
    int failed_before_rename = 0;
    const char *name = handler->name;
    bool cgroups_connected = false;
    int saved_ns_fd[LXC_NS_MAX]; //LXC_NS_MAX
    int preserve_mask = 0, i;
    int netpipepair[2], nveths; //网络相关



    netpipe = -1;
for (i = 0; i < LXC_NS_MAX; i++)
        if (handler->conf->inherit_ns_fd[i] != -1) //暂时忽略
            preserve_mask |= ns_info[i].clone_flag;



    if (lxc_sync_init(handler)) //同步socketpair
        return -1;



    handler->clone_flags = CLONE_NEWPID|CLONE_NEWNS;
    if (!lxc_list_empty(&handler->conf->id_map)) {//id_map空,初始NEWUSER
 INFO(“Cloning a new user namespace”);
        handler->clone_flags |= CLONE_NEWUSER;
    }
//这里开始创建NEWNET了
if (handler->conf->inherit_ns_fd[LXC_NS_NET] == -1) {
        if (!lxc_requests_empty_network(handler))
            handler->clone_flags |= CLONE_NEWNET;
    
        if (!lxc_list_empty(&handler->conf->network)) {
    
            /* Find gateway addresses from the link device, which is
             * no longer accessible inside the container. Do this
             * before creating network interfaces, since goto
             * out_delete_net does not work before lxc_clone. */
            if (lxc_find_gateway_addresses(handler)) {
                ERROR(“failed to find gateway addresses”);
                lxc_sync_fini(handler);
                return -1;
            }



            /* that should be done before the clone because we will
             * fill the netdev index and use them in the child
             */
            if (lxc_create_network(handler)) {
                ERROR(“failed to create the network”);
                lxc_sync_fini(handler);
                return -1;
            }
        }
if (save_phys_nics(handler->conf)) { //save phys nics
            ERROR(“failed to save physical nic info”);
            goto out_abort;
        }
    } else {
        INFO(“Inheriting a net namespace”);
    }
if (handler->conf->inherit_ns_fd[LXC_NS_IPC] == -1) { //NS_IPC
        handler->clone_flags |= CLONE_NEWIPC;
    } else {
        INFO(“Inheriting an IPC namespace”);
    }



    if (handler->conf->inherit_ns_fd[LXC_NS_UTS] == -1) { //NS_UTS
        handler->clone_flags |= CLONE_NEWUTS;
    } else {
        INFO(“Inheriting a UTS namespace”);
    }
if (!cgroup_init(handler)) { //init cgroup
        ERROR(“failed initializing cgroup support”);
        goto out_delete_net;
    }
//这里ops一直为空,搞了半天不知道是怎么初始化ops的
//attribute((constructor))很大可能是这个
    cgroups_connected = true;



    if (!cgroup_create(handler)) {
        ERROR(“failed creating cgroups”);
        goto out_delete_net;
    }
/*
     * if the rootfs is not a blockdev, prevent the container from
     * marking it readonly.
     *
     * if the container is unprivileged then skip rootfs pinning
     /
    if (lxc_list_empty(&handler->conf->id_map)) { //刚才是空?
        handler->pinfd = pin_rootfs(handler->conf->rootfs.path);
        if (handler->pinfd == -1)
            INFO(“failed to pin the container’s rootfs”);
    }
if (preserve_ns(saved_ns_fd, preserve_mask) < 0) //打开/prco/self/ns下面的东西
        goto out_delete_net;
    if (attach_ns(handler->conf->inherit_ns_fd) < 0) //
        goto out_delete_net;
//下面是创建网络的pipe?
    if (am_unpriv() && (nveths = count_veths(&handler->conf->network))) {
        if (pipe(netpipepair) < 0) {
            SYSERROR(“Error creating pipe”);
            goto out_delete_net;
        }
        /
store netpipe in the global var for do_start’s use /
        netpipe = netpipepair[0];
    }
/
Create a process in a new set of namespaces */
    handler->pid = lxc_clone(do_start, handler, handler->clone_flags);
    if (handler->pid < 0) {
        SYSERROR(“failed to fork into a new namespace”);
        goto out_delete_net;
my god lxc_clone 又要跳了。。。
首先看下传递的参数吧
do_start函数指针 ,handler, handler->clone_flags,一堆NS的设置
简述下lxc_clone函数里面的流程
指定一页内存大小做为子进程的栈空间,然后调用系统的clone 进行clone,回头开一章说里面的一些函数调用。
ret = clone(do_clone, stack  + stack_size, flags | SIGCHLD, &clone_arg);
/*
 * Trampoline handed to clone(2): unpack the clone_arg wrapper and call
 * the caller-supplied function with its argument.
 */
static int do_clone(void *arg)
{
    struct clone_arg *a = arg;

    return a->fn(a->arg);
}
Do_clone里调用刚才的clone的指针do_start
Ok,到do_start中去看,
static int do_start(void *data)
{
    struct lxc_handler *handler = data;
    const char *lsm_label = NULL;



    if (sigprocmask(SIG_SETMASK, &handler->oldmask, NULL)) {
        SYSERROR(“failed to set sigprocmask”);
        return -1;
    }



        /* This prctl must be before the synchro, so if the parent
     * dies before we set the parent death signal, we will detect
     * its death with the synchro right after, otherwise we have
     * a window where the parent can exit before we set the pdeath
     * signal leading to a unsupervized container.
     */
    if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0)) { //和前面的prctl一样
        SYSERROR(“failed to set pdeath signal”);
        return -1;
    }



lxc_sync_fini_parent(handler);
/* don’t leak the pinfd to the container */
    if (handler->pinfd >= 0) {
        close(handler->pinfd);
    }



    /* Tell the parent task it can begin to configure the
     * container and wait for it to finish
     /
    if (lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE))
        return -1;
        
    if (read_unpriv_netifindex(&handler->conf->network) < 0)
        goto out_warn_father;
/

     * if we are in a new user namespace, become root there to have
     * privilege over our namespace
     /
    if (!lxc_list_empty(&handler->conf->id_map)) { //设置gid uid。。
        NOTICE(“switching to gid/uid 0 in new user namespace”);
        if (setgid(0)) {
            SYSERROR(“setgid”);
            goto out_warn_father;
        }
        if (setuid(0)) {
            SYSERROR(“setuid”);
            goto out_warn_father;
        }
        if (setgroups(0, NULL)) {
            SYSERROR(“setgroups”);
            goto out_warn_father;
        }
    }
#if HAVE_SYS_CAPABILITY_H //这个跟编译时候有关,config中也有一条
    if (handler->conf->need_utmp_watch) {
        if (prctl(PR_CAPBSET_DROP, CAP_SYS_BOOT, 0, 0, 0)) {
            SYSERROR(“failed to remove CAP_SYS_BOOT capability”);
            goto out_warn_father;
        }
        DEBUG(“Dropped cap_sys_boot”);
    }
#endif
/
Setup the container, ip, names, utsname, … /
    if (lxc_setup(handler)) { //终于要配置container了
        ERROR(“failed to setup the container”);
        goto out_warn_father;
    }
Setup的代码就不放出来了,主要的函数贴上来。
setup_utsname(lxc_conf->utsname)
setup_network(&lxc_conf->network)
run_lxc_hooks(name, “pre-mount”, lxc_conf)
setup_rootfs(lxc_conf)
if (lxc_conf->autodev) mount_autodev(lxc_conf->rootfs.mount)
setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)
run_lxc_hooks(name, “mount”, lxc_conf)
if (lxc_conf->autodev) {
run_lxc_hooks(name, “autodev”, lxc_conf)
setup_autodev(lxc_conf->rootfs.mount) }
setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)
setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)
setup_pivot_root(&lxc_conf->rootfs)
setup_pts(lxc_conf->pts)
setup_personality(lxc_conf->personality)
setup_caps(&lxc_conf->caps)
后面再好好的研究,先把步骤理清。下面几个先看注释了。
/
ask father to setup cgroups and wait for him to finish */
    if (lxc_sync_barrier_parent(handler, LXC_SYNC_CGROUP))
        return -1;



    /* Set the label to change to when we exec(2) the container’s init /
    if (!strcmp(lsm_name(), “AppArmor”))
        lsm_label = handler->conf->lsm_aa_profile;
    else if (!strcmp(lsm_name(), “SELinux”))
        lsm_label = handler->conf->lsm_se_context;
    if (lsm_process_label_set(lsm_label, 1, 1) < 0)
        goto out_warn_father;
/
Some init’s such as busybox will set sane tty settings on stdin,
     * stdout, stderr which it thinks is the console. We already set them
     * the way we wanted on the real terminal, and we want init to do its
     * setup on its console ie. the pty allocated in lxc_console_create()
     * so make sure that that pty is stdin,stdout,stderr.
     */
    if (lxc_console_set_stdfds(handler) < 0)
        goto out_warn_father;



    /* If we mounted a temporary proc, then unmount it now */
tmp_proc_unmount(handler->conf);
if (lxc_seccomp_load(handler->conf) != 0)
        goto out_warn_father;



    if (run_lxc_hooks(handler->name, “start”, handler->conf, handler->lxcpath, NULL)) {
        ERROR(“failed to run start hooks for container ‘%s’.”, handler->name);
        goto out_warn_father;
    }
/* The clearenv() and putenv() calls have been moved here
     * to allow us to use environment variables passed to the various
     * hooks, such as the start hook above.  Not all of the
     * variables like CONFIG_PATH or ROOTFS are valid in this
     * context but others are. /
    if (clearenv()) {
        SYSERROR(“failed to clear environment”);
        /
don’t error out though */
    }    



    if (putenv(“container=lxc”)) {
        SYSERROR(“failed to set environment variable”);
        goto out_warn_father;
    }    



    close(handler->sigfd);



    /* after this call, we are in error because this
     * ops should not return as it execs /
    handler->ops->start(handler, handler->data); //看怎么跳回去
handler在lxc_start 跳到__lxc_start 的时候就给ops的start 赋值函数指针start了,因此直接跳到start函数中。
/*
 * start - the default start_ops callback, run inside the container's
 * namespaces as the final step: exec the container's init (argv[0]).
 * On success execvp() never returns; reaching the return statement
 * means the exec failed.
 */
static int start(struct lxc_handler *handler, void *data)
{
    struct start_args *arg = data;

    NOTICE("exec'ing '%s'", arg->argv[0]);

    execvp(arg->argv[0], arg->argv);
    SYSERROR("failed to exec %s", arg->argv[0]);
    return 0;
}
这里面开始执行容器的rootfs下面的第一个启动选项,default_args是/sbin/init,可以在start的时候指定。
Ok 从clone中回到 lxc_spawn这个中看后面怎么执行的。
if (attach_ns(saved_ns_fd))
        WARN(“failed to restore saved namespaces”);



    lxc_sync_fini_child(handler);
//一些cgroup的配置,将对用的namespace写入cgroup中
    if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
        failed_before_rename = 1;
if (!cgroup_create_legacy(handler)) {
        ERROR(“failed to setup the legacy cgroups for %s”, name);
        goto out_delete_net;
    }
    if (!cgroup_setup_limits(handler, false)) {
        ERROR(“failed to setup the cgroup limits for ‘%s’”, name);
        goto out_delete_net;
    }
        
    if (!cgroup_enter(handler))
        goto out_delete_net;
    
    if (!cgroup_chown(handler)) 
        goto out_delete_net;
    
    if (failed_before_rename)
        goto out_delete_net;
//网络配置
/* Create the network configuration */
    if (handler->clone_flags & CLONE_NEWNET) {
        if (lxc_assign_network(&handler->conf->network, handler->pid)) {
            ERROR(“failed to create the configured network”);
            goto out_delete_net;
        }
    }



    if (netpipe != -1) {
        struct lxc_list *iterator;
        struct lxc_netdev *netdev;
    
        close(netpipe);
        lxc_list_for_each(iterator, &handler->conf->network) {
            netdev = iterator->elem;
            if (netdev->type != LXC_NET_VETH)
                continue;
            if (write(netpipepair[1], netdev->name, IFNAMSIZ) != IFNAMSIZ) {
                ERROR(“Error writing veth name to container”);
                goto out_delete_net;
            }
        }
        close(netpipepair[1]);
}
    /* map the container uids - the container became an invalid
     * userid the moment it was cloned with CLONE_NEWUSER - this
     * call doesn’t change anything immediately, but allows the
     * container to setuid(0) (0 being mapped to something else on
     * the host) later to become a valid uid again */
    if (lxc_map_ids(&handler->conf->id_map, handler->pid)) {
        ERROR(“failed to set up id mapping”);
        goto out_delete_net;
    }



    /* Tell the child to continue its initialization.  we’ll get
     * LXC_SYNC_CGROUP when it is ready for us to setup cgroups
     */
    if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE))
        goto out_delete_net;



    if (!cgroup_setup_limits(handler, true)) {
        ERROR(“failed to setup the devices cgroup for ‘%s’”, name);
        goto out_delete_net;
    }



    cgroup_disconnect();
    cgroups_connected = false;
/* Tell the child to complete its initialization and wait for
     * it to exec or return an error.  (the child will never
     * return LXC_SYNC_POST_CGROUP+1.  It will either close the
     * sync pipe, causing lxc_sync_barrier_child to return
     * success, or return a different value, causing us to error
     * out).
     */
    if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CGROUP))
        return -1;



    if (detect_shared_rootfs())
        umount2(handler->conf->rootfs.mount, MNT_DETACH);



    if (handler->ops->post_start(handler, handler->data)) //post_start什么也没干,就赋值,直接return了
        goto out_abort;



    if (lxc_set_state(name, handler, RUNNING)) {
        ERROR(“failed to set state to %s”,
                  lxc_state2str(RUNNING));
        goto out_abort;
    }



    lxc_sync_fini(handler);



    return 0;
现在spaw函数结束,很多细节都没看,里面每个地方估计都要看很久,先把握下流程吧。
netnsfd = get_netns_fd(handler->pid);



    err = lxc_poll(name, handler);
    if (err) {
        ERROR(“mainloop exited with an error”);
        if (netnsfd >= 0)
            close(netnsfd);
        goto out_abort;
    }
又来了一个poll函数
看看poll做了什么
调用epoll_create用于通信
lxc_mainloop_open(&descr)
注册各种epoll事件
lxc_mainloop_add_handler(&descr, sigfd, signal_handler, &pid)
lxc_console_mainloop_add(&descr, handler)
lxc_command_mainloop_add(name, &descr, handler)
if (handler->conf->need_utmp_watch) lxc_utmp_mainloop_add(&descr, handler)
最后返回的时候又调用了lxc_mainloop(&descr)
while (waitpid(handler->pid, &status, 0) < 0 && errno == EINTR)
        continue;



    /*
     * If the child process exited but was not signaled,
     * it didn’t call reboot.  This should mean it was an
     * lxc-execute which simply exited.  In any case, treat
     * it as a ‘halt’
     /
        if (WIFSIGNALED(status)) {
        switch(WTERMSIG(status)) {
        case SIGINT: /
halt /
            DEBUG(“Container halting”);
            break;
        case SIGHUP: /
reboot /
            DEBUG(“Container rebooting”);
            handler->conf->reboot = 1;
            break;
        case SIGSYS: /
seccomp */
            DEBUG(“Container violated its seccomp policy”);
            break;
        default:
            DEBUG(“unknown exit status for init: %d”, WTERMSIG(status));
            break;
        }
        }



    lxc_rename_phys_nics_on_shutdown(netnsfd, handler->conf);
    if (netnsfd >= 0)
        close(netnsfd);



    if (handler->pinfd >= 0) {
        close(handler->pinfd);
        handler->pinfd = -1;
}



    lxc_monitor_send_exit_code(name, status, handler->lxcpath);
    err =  lxc_error_set_and_log(handler->pid, status);
}



1、首先就是第一个lxc_check_inherited函数



dir = opendir(“/proc/self/fd”);



    if (!dir) {



        WARN(“failed to opendirectory: %m”);



        return -1;



}



此函数是根据配置将/proc/self/fd下,关闭fd。



然后就跳到__lxc_start中



2、看下lxc-init



在init中 设置一些关于LXC_XXX的环境变量,猜测用于后面的使用。



可以在lxc启动的时候加一些脚本。



会在hook中先执行pre-start的前缀的脚本



if (run_lxc_hooks(name, “pre-start”, conf,handler->lxcpath, NULL)) {



        ERROR(“failed to runpre-start hooks for container ‘%s’.”, name);



        goto out_aborting;



}



继续,后面有调用lxc_create_tty,细致研究发现,这个函数是根据conf中设置tty的个数,通过opentty函数来创建pts给容器使用。



ret = openpty(&pty_info->master, &pty_info->slave,pty_info->name,NULL, NULL);



这个可以在config文件中设置tty的个数



tty的作用是,如果容器配置了根文件系统和inittab文件设置启动gettty,同时在inittab中gettty的个数不能超过设置的tty的个数,否则会出问题



同理 lxc_console_create 也是一样



如果容器配置了根文件系统和inittab文件设置使用控制台,您可能希望指定该控制台的输出。可以在config中设置lxc.console.logfile来指定输出的位置,lxc.console指定console的个数



然后通过ttys_shift_ids来设置tty的owner。



这样init的初始化过程就结束了。



3、然后到must_drop_cap_sys_boot(handler->conf)这个步骤中。



这个函数会读系统中/proc/sys/kernel/ctrl-alt-del这个文件,判断确定cmd的命令,cmd = v ?LINUX_REBOOT_CMD_CAD_ON : LINUX_REBOOT_CMD_CAD_OFF;



然后会系统调用clone,其中函数指针为container_reboot_supported,最终会调用reboot这个函数,



通过man reboot可以看到细节



LINUX_REBOOT_CMD_CAD_OFF



             (RB_DISABLE_CAD,  0).   CAD is  disabled.   This means  that  the CAD keystroke will cause a SIGINT signalto be sent to init



              (process 1),whereupon this process may decide upon a proper action (maybe: kill allprocesses, sync, reboot).



 



       LINUX_REBOOT_CMD_CAD_ON



              (RB_ENABLE_CAD,0x89abcdef).  CAD is enabled.  This means that the CAD keystroke willimmediately cause the  action  associated



              withLINUX_REBOOT_CMD_RESTART.



那么,问题来了,到底reboot什么东西,系统?还是container?一个已经启动,一个正在start过程。










暂时还没搞懂,是不是NEWPID NEWUSER 启动的新的namespace的空间中的东西,可能发SIGINT信号给主机的init的进程。将以前启动的container剩余的部分重新启动?先mark一下。


4、然后判断if (geteuid() == 0&& !lxc_list_empty(&conf->id_map)),id_map是空的,因为目前所有的的流程,都是以privilegecontainer说的,所有非root的用户就不分析了。



检查rootfs_is_blockdev(conf) 感觉函数是在判断rootfs的路径是否为blockdev,然后remount_all_slave打开/proc/self/mountinfo然后将shared enties 改变到slave中,就看当前的系统有没有share entries了。



然后调用do_rootfs_setup(conf, name,lxcpath) 将container rootfs 挂载上去。同时也通过pre-mount的脚本将自定义的一些mount 加进去,因此,这个地方也可以自己自定义,复用一些东西










然后调用setup_rootfs,先是调用mount("", "/", NULL, MS_SLAVE|MS_REC, 0),mount /,调用bdev_init,初始化rootfs。


5、然后进去lxc-spawn这个函数中,在别的地方很多次见到spawn这个函数,只知道spawn的英文意思是产卵的意思。这个函数上次分析,里面有很多事在做。



首先将以前的cloneflag 保存,记得start的刚开始初始化的时候如果没设置,ns_info中都设置默认的-1,然后就是同步handler,没什么好说的。



然后就是讲handler的clone_flags设置CLONE_NEWXXX,获取物理网络,等等设置一堆东西, 然后就要想办法将cgroup与namespace联系到一块了,到cgroup_init里面看看是什么流程。



首先,前面一直迷惑的ops怎么被初始化的问题,



__attribute__((constructor))



void cgroup_ops_init(void)



这个结构,在函数未调用之前就被执行了,这个回头会在杂篇中讲到,首先程序会根据系统中是否有cgmanager 来使用不同的初始化函数,本文就默认没有cgmanager,调用通用的cgfs_ops_init;返回一个引用值,返回静态变量cgfs_ops;将一些指针赋值,ok,看cgroup_init初始化过程,init指向cgfs_init,因此到cgfs_init这个函数中看一下



首先初始化cgfs_data的数据结构,然后设置cgroup_pattern为全局变量中lxc.cgroup.pattern即在编译中的DEFAULT_CGROUP_PATTERN,默认的是/lxc/%n,这个暂时不知道含义。继续看



然后调用lxc_cgroup_load_meta加载metadata,函数中会判断cgroup的使用情况,然后会调用lxc_cgroup_load_meta2的函数,会查找子系统的白名单,或者指定的hierarchies。



最终返回给handler->cgroup_data。



然后调用cgroup_create(handler)来创建cgroup,调用ops的create,create的指针指向cgfs_create,是个内联函数,最终调用lxc_cgroupfs_create,lxc_cgroupfs_create(d->name,d->cgroup_pattern, md, NULL)用来创建new cgroup



/* we will modify the result of this operation directly,



     * so we don’t have to copythe data structure



     */



   base_info = (path_pattern[0]== ‘/’) ?



    lxc_cgroup_process_info_get_init(meta_data) :    //pattern为/lxc/%n



     lxc_cgroup_process_info_get_self(meta_data);



    if (!base_info)



        return NULL;



其中get_init为returnlxc_cgroup_process_info_get(1, meta);pid 为1号进程get数据,根据/proc/1/cgroup中的信息添加到cgroup_process_info的链表中。



new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1,sizeof(char *));



    if (!new_cgroup_paths)



        goto out_initial_error;



 



    new_cgroup_paths_sub =calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));



    if (!new_cgroup_paths_sub)



        goto out_initial_error;



分配空间



/* find mount points we can use */



    for (info_ptr = base_info;info_ptr; info_ptr = info_ptr->next) {



        h =info_ptr->hierarchy;



        mp =lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);



        if (!mp) {



            ERROR(“Could notfind writable mount point for cgroup hierarchy %d while trying to createcgroup.”, h->index);



            gotoout_initial_error;



        }



        info_ptr->designated_mount_point= mp;



 



        if(lxc_string_in_array(“ns”, (const char **)h->subsystems))



            continue;



        if(handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {



            ERROR(“Could notset clone_children to 1 for cpuset hierarchy in parent cgroup.”);



            gotoout_initial_error;



        }



}



/* normalize the path */



    cgroup_path_components =lxc_normalize_path(path_pattern);



    if (!cgroup_path_components)



        goto out_initial_error;



来看主要的find_name_on_this_level程序块



/* determine name of the path component we should create */



        if (contains_name&& suffix > 0) {



            char *buf =calloc(strlen(name) + 32, 1);



            if (!buf)



                gotoout_initial_error;



            snprintf(buf, strlen(name)+ 32, “%s-%u”, name, suffix);



            current_component =lxc_string_replace(“%n”, buf, p_eff);



            free(buf);



        } else {



            current_component =contains_name ? lxc_string_replace(“%n”, name, p_eff) : p_eff;



        }



        parts[0] = path_so_far;



        parts[1] =current_component;



        parts[2] = NULL;



        current_subpath =path_so_far ? lxc_string_join(“/”, (const char **)parts, false) :current_component;



/* Now go through each hierarchy and try to create the



         * corresponding cgroup



         */



其中最主要的是



r = create_cgroup(info_ptr->designated_mount_point,current_entire_path);来创建cgroup的目录层级。



理一下头绪,cgroup通过cgroup.patternd 的模式,然后读取/proc/1/cgroup下去创建相应的cgroup层级,最后创建cgroup的目录。



6、回到lxc-spawn中,然后到通过一些网络的netpipepair设置,这些都不是我们关心的。



最后调用lxc_clone函数调用do_start来对container进行一系列的初始化操作,首先是lxc_setup 前面也介绍了,通过初始化,mount rootfs,网络,autodev,自动挂载/proc,/sys等文件,然后设置tty,console等设置标准输入输出的位置,等等。



然后可以设置if(run_lxc_hooks(handler->name, “start”, handler->conf,handler->lxcpath, NULL)) start脚本来辅助工作,这个也是可以自定义的内容



最后在do_start函数中调用handler->ops->start(handler,handler->data);



ops为lxc的operation中的内容,来看看想干嘛。execvp(arg->argv[0],arg->argv);执行start container了,这里面,我们用到的是/init不是默认的/sbin/init,因为我们的容器不是标准的容器,所以这点是不同的。



里面注释也谈到了,当我们执行这个/init的时候,函数就不会返回来了,那么后面的程序怎么办?



所以在do_start中子进程一直等到父进程完成工作和配置。



/* Tell the parent task it can begin to configure the



     * container and wait for itto finish



     */



    if(lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE))



        return -1;



然后父进程进行一系列的配置,其中最主要的就是cgroup的配置,如果容器没有cgroup的话,资源划分就成问题了,



cgroup_setup_limits 资源限制,cgroup_enter将pid进程加入task任务中,等等设置cgroup



然后还是配置网络,将container加入到veth当中,这当年还是要看自己config网络相关的配置,so,网络配置有很多,就忽略网络的问题了。



然后又告诉子进程继续初始化过程



/* Tell the child to continue its initialization.  we’ll get



     * LXC_SYNC_CGROUP when it isready for us to setup cgroups



     */



    if(lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE))



        goto out_delete_net;



然后当子进程setup过程完成之后,让父进程设置cgroup,同时父进程设置完cgroup时,也通知子进程完成,此时子进程就真正进入到container的init的进程了。



一直没发现这个LXC_SYNC_POST_CGROUPwait 子进程的信号谁发给他,这个比较疑惑?



最后发现是do_start这个函数if判断失败后goto的,则表示中间会error,最后还有个post_cgroup,注释是这样说道。



/* Tell the child to complete its initialization and wait for



     * it to exec or return anerror.  (the child will never



     * returnLXC_SYNC_POST_CGROUP+1.  It will eitherclose the



     * sync pipe, causinglxc_sync_barrier_child to return



     * success, or return adifferent value, causing us to error



     * out).



     */



    if(lxc_sync_barrier_child(handler, LXC_SYNC_POST_CGROUP))



        return -1;



然后就是调用post-start,NOTICE 运行的pid,最后设置container的状态为RUNNING,至此spawn就结束了。



回到__lxc_start中,get_netns_fd获得network的状态,然后进入lxc_poll中.后面没什么好说的,现在主要考虑lxc 在exec container的init的进程过后,lxc是如何继续接管程序的。



lxc start部分的源码的大致工作流程已经熟悉,那么就要关注他的核心内容了,就是关于namespace 和 cgroup的内容了。



根据前面的分析已经知道,lxc根据一些配置会自动将flag设置成CLONE_NEWXXX,然后会通过cgroup init 来初始化一堆 cgroup。我们先来看一下。



首先通过cgroup_create 来创建 cgroup,前面介绍都是有个ops 指向函数指针,这里先假设我们用的cgfs,理论上应该和cgroupmanager是一样的方式,可能细节有区别而已。



那么顺理成章create指向cgfs_create,后面就直接说函数指针的位置了。



函数内部通过调用lxc_cgroupfs_create。那么就要从create a newcgroup



static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)



 



char**cgroup_path_components = NULL;



    char **p = NULL;



    char *path_so_far = NULL;



    char **new_cgroup_paths =NULL;



    char **new_cgroup_paths_sub =NULL;



    struct cgroup_mount_point*mp;



    struct cgroup_hierarchy *h;



    struct cgroup_process_info*base_info = NULL;



    struct cgroup_process_info*info_ptr;



    int saved_errno;



    int r;



    unsigned suffix = 0;



    bool had_sub_pattern = false;



size_t i;



 



if (!is_valid_cgroup(name)){                                      //判断name 是否有效



        ERROR(“Invalidcgroup name: ‘%s’”, name);



        errno = EINVAL;



        return NULL;



}



 



if (!strstr(path_pattern,”%n”)) {



        ERROR(“Invalidcgroup path pattern: ‘%s’; contains no %%n for specifying container name”,path_pattern);



        errno = EINVAL;



        return NULL;



}



根据privilege 和unprivilege  container的不同读取到proc 下面的pid的不同来确定不同的cgroup 信息。



base_info = (path_pattern[0]== ‘/’) ?



       lxc_cgroup_process_info_get_init(meta_data) :



       lxc_cgroup_process_info_get_self(meta_data);



    if (!base_info)



        return NULL;



new_cgroup_paths =calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));



    if (!new_cgroup_paths)



        goto out_initial_error;



 



在自己机子上面,看到的cgroup:



gudh@lxc-D3F2-CM:~$ cat/proc/self/cgroup



11:name=systemd:/user/1004.user/5.session



10:hugetlb:/user/1004.user/5.session



9:perf_event:/user/1004.user/5.session



8:blkio:/user/1004.user/5.session



7:freezer:/user/1004.user/5.session



6:devices:/user/1004.user/5.session



5:memory:/user/1004.user/5.session



4:cpuacct:/user/1004.user/5.session



3:cpu:/user/1004.user/5.session



2:cpuset:/user/1004.user/5.session



gudh@lxc-D3F2-CM:~$ id



uid=1004(gudh)gid=1004(gudh) groups=1004(gudh),0(root),4(adm)



gudh@lxc-D3F2-CM:~$ cat/proc/1/cgroup



11:name=systemd:/



10:hugetlb:/



9:perf_event:/



8:blkio:/



7:freezer:/



6:devices:/



5:memory:/



4:cpuacct:/



3:cpu:/



2:cpuset:/



然后就是分配path的大小



    new_cgroup_paths_sub =calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));



    if (!new_cgroup_paths_sub)



        goto out_initial_error;



 



查找可以挂载的点,然后创建。



/* find mount points we canuse */



    for (info_ptr = base_info; info_ptr;info_ptr = info_ptr->next) {



        h = info_ptr->hierarchy;



        mp = lxc_cgroup_find_mount_point(h,info_ptr->cgroup_path, true);



        if (!mp) {



            ERROR(“Could not find writablemount point for cgroup hierarchy %d while trying to create cgroup.”,h->index);



            goto out_initial_error;



        }



        info_ptr->designated_mount_point =mp;



 



        if (lxc_string_in_array(“ns”,(const char **)h->subsystems))



            continue;



        if (handle_cgroup_settings(mp,info_ptr->cgroup_path) < 0) {



            ERROR(“Could not setclone_children to 1 for cpuset hierarchy in parent cgroup.”);



            goto out_initial_error;



        }



    }



 



cgroup_path_components = lxc_normalize_path(path_pattern);



    if (!cgroup_path_components)



        goto out_initial_error;



 



然后根据normalize的path去创建他们。



/* go through the pathcomponents to see if we can create them */











    for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {


        /* we only want to create the samecomponent with -1, -2, etc.



         * if the component contains thecontainer name itself, otherwise



         * it’s not an error if it alreadyexists



         */



        char *p_eff = *p ? *p : (char *)sub_pattern;



        bool contains_name = strstr(p_eff, "%n");



        char *current_component = NULL;



        char *current_subpath = NULL;



        char *current_entire_path = NULL;



        char *parts[3];



        size_t j = 0;



        i = 0;



 



       /* if we are processing the subpattern, we want to make sure



         * loop is ended the next time around



         */



        if (!*p) {



            had_sub_pattern = true;



            p--;



        }



 



然后就到find_name_on_this_level,这里面pattern 应该是/lxc/%n



 



        goto find_name_on_this_level;



find_name_on_this_level:



        /* determine name of the path componentwe should create */



        if (contains_name && suffix> 0) {



            char *buf = calloc(strlen(name) +32, 1);



            if (!buf)



               goto out_initial_error;



            snprintf(buf, strlen(name) + 32,”%s-%u”, name, suffix);



            current_component =lxc_string_replace(“%n”, buf, p_eff);



            free(buf);



        } else {



            current_component = contains_name ?lxc_string_replace(“%n”, name, p_eff) : p_eff;



        }



        parts[0] = path_so_far;



        parts[1] = current_component;



        parts[2] = NULL;



        current_subpath = path_so_far ?lxc_string_join(“/”, (const char **)parts, false) :current_component;



紧接着创建相应的cgroup



for (i = 0, info_ptr =base_info; info_ptr; info_ptr = info_ptr->next, i++) {



            char *parts2[3];



 



            if(lxc_string_in_array(“ns”, (const char**)info_ptr->hierarchy->subsystems))



                continue;



            current_entire_path = NULL;



 



            parts2[0] =!strcmp(info_ptr->cgroup_path, “/”) ? “” :info_ptr->cgroup_path;



            parts2[1] = current_subpath;



            parts2[2] = NULL;



            current_entire_path = lxc_string_join(“/”,(const char **)parts2, false);



 



            if (!*p) {



                /* we are processing thesubpath, so only update that one */



                free(new_cgroup_paths_sub[i]);



                new_cgroup_paths_sub[i] =strdup(current_entire_path);



                if (!new_cgroup_paths_sub[i])



                    goto cleanup_from_error;



            } else {



                /* remember which path was usedon this controller */



                free(new_cgroup_paths[i]);



                new_cgroup_paths[i] =strdup(current_entire_path);



                if (!new_cgroup_paths[i])



                    goto cleanup_from_error;



            }



 



            r =create_cgroup(info_ptr->designated_mount_point, current_entire_path);



这样就完成相应的代码设置。



对于pattern 为/lxc/%n 就分两次不同创建在相应的目录,这样cgroup subpath 也同时受到顶层/lxc 的控制,cgroup就成功创建了。



 



       然后就到cgroup_create_legacy最终调用lxc_cgroup_create_legacy



直接看注释



/*  



     * if cgroup is mounted at/cgroup and task is in cgroup /ab/, pid 2375 and



     * name is c1,



     * dir: /ab



     * fulloldpath =/cgroup/ab/2375



     * fullnewpath =/cgroup/ab/c1



     * newname = /ab/c1



     */



如果老名字为/sys/cgroup/cpu/lxc/android/2375



那么就改成/sys/cgroup/cpu/lxc/android/android?



加入cgroup一些创建file的 capability



 



cgroup_setup_limits 名字很明显设置限额 with_device是false



将在config中加入的device.allow 和device.deny 配置



手动设置的地方



 



然后就是cgfs_enter 最后到lxc_cgroupfs_enter



lxc_cgroup_find_mount_point 查找path下面的mount point



cgroup_to_absolute_path absolute path



lxc_write_to_file然后将pid写入到cgroup的absolutepath下面



这样就将pid 与cgroup成功绑定。
cgroup_chown chown的指针目前是NULL 暂时不分析
后面又来了一次 cgroup_setup_limits 这是with_device 是true
此时应该就完成了cgroup的相关设置


Category docker