Changes the apparent root directory of a process. A process inside the chroot can't access the rest of the filesystem tree.
# create chroot and subdirs
root@w540:/var/tmp# mkdir testchroot
root@w540:/var/tmp# mkdir testchroot/{bin,lib,lib64}
# copy binaries
root@w540:/var/tmp# cd testchroot/bin/
root@w540:/var/tmp/testchroot/bin# cp /bin/ls .
root@w540:/var/tmp/testchroot/bin# cp /bin/bash .
# identify libraries
root@w540:/var/tmp/testchroot# ldd /bin/bash
linux-vdso.so.1 (0x00007ffed07f0000)
libtinfo.so.5 => /lib/x86_64-linux-gnu/libtinfo.so.5 (0x00007f7fb4bbf000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f7fb49bb000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f7fb45ca000)
/lib64/ld-linux-x86-64.so.2 (0x00007f7fb5103000)
# copy ALL required libraries for bash and ls
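# A minimal sketch of that step, run from /var/tmp/testchroot, based on the
# ldd output above (the library paths are from this particular system and
# will differ elsewhere; linux-vdso is virtual and needs no copy):
mkdir -p lib/x86_64-linux-gnu
cp /lib/x86_64-linux-gnu/{libtinfo.so.5,libdl.so.2,libc.so.6} lib/x86_64-linux-gnu/
cp /lib64/ld-linux-x86-64.so.2 lib64/
# repeat with the libraries that `ldd /bin/ls` reports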
# create a new file
root@w540:/var/tmp/testchroot# echo "hello" > test.txt
# move into the chrooted environment - chroot NEWROOT [COMMAND]
root@w540:/var/tmp/testchroot# chroot /var/tmp/testchroot/ /bin/bash
bash-4.4# ls
bin lib lib64 test.txt
bash-4.4# cat test.txt
bash: cat: command not found
bash-4.4#
# ls is available, but cat isn't
bash-4.4# pwd
/
bash-4.4# cd ..
bash-4.4# pwd
/
# I'm still in the root directory: cd .. can't escape the chroot
chroot is also used by sshd: the ChrootDirectory directive can be applied as the action of a Match block that targets a group of users.
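For example, a hypothetical sshd_config fragment (the group name and path are placeholders; the chroot target must be root-owned and not group/world-writable):
Match Group sftponly
    ChrootDirectory /srv/chroot/%u
    ForceCommand internal-sftp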
Namespaces are a Linux kernel feature. A namespace limits what a process can see of a system resource (cgroups limit what you can access).
There are six classic Linux namespaces: User / IPC / UTS / Mount / Network / PID (newer kernels add more, such as the cgroup namespace visible in the listing below).
The namespaces are per process and can be listed under /proc/<pid>/ns/:
root@twickenham:/home/# ps aux | grep "[s]shd -D"
root 4514 0.0 0.0 15852 7272 ? Ss 20:58 0:00 /usr/sbin/sshd -D
root@twickenham:/home/# ll /proc/4514/ns/
total 0
lrwxrwxrwx 1 root root 0 Jan 19 21:37 cgroup -> 'cgroup:[4026531835]'
lrwxrwxrwx 1 root root 0 Jan 19 21:37 ipc -> 'ipc:[4026531839]'
lrwxrwxrwx 1 root root 0 Jan 19 21:37 mnt -> 'mnt:[4026531840]'
lrwxrwxrwx 1 root root 0 Jan 19 21:37 net -> 'net:[4026531992]'
lrwxrwxrwx 1 root root 0 Jan 19 21:37 pid -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Jan 19 21:37 pid_for_children -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Jan 19 21:37 user -> 'user:[4026531837]'
lrwxrwxrwx 1 root root 0 Jan 19 21:37 uts -> 'uts:[4026531838]'
All the processes pointing to the same inode are considered to be in the same namespace.
Adding a new network namespace:
root@twickenham:/home/# ip netns add sample1
root@twickenham:/home/# ip netns list
sample1
Check iptables for my default namespace:
root@twickenham:/home/# iptables -L DOCKER-ISOLATION-STAGE-2
Chain DOCKER-ISOLATION-STAGE-2 (3 references)
target prot opt source destination
DROP all -- anywhere anywhere
DROP all -- anywhere anywhere
DROP all -- anywhere anywhere
RETURN all -- anywhere anywhere
Now, if I try to list the same chain in my newly created namespace sample1, I get:
root@twickenham:/home/# ip netns exec sample1 iptables -L DOCKER-ISOLATION-STAGE-2
iptables: No chain/target/match by that name.
The change is more apparent if I start a bash process instead of simply running iptables:
root@twickenham:/home/# ip netns exec sample1 bash
root@twickenham:/home/# echo $BASHPID
4840
For this new bash, the net namespace is:
root@twickenham:/home/# ll /proc/4840/ns/net
lrwxrwxrwx 1 root root 0 Jan 19 22:19 /proc/4840/ns/net -> 'net:[4026532685]'
and for a different bash:
root@twickenham:/home/# ll /proc/${BASHPID}/ns/net
lrwxrwxrwx 1 root root 0 Jan 19 22:20 /proc/4834/ns/net -> 'net:[4026531992]'
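The inode numbers differ (4026532685 vs 4026531992), so the two bash processes are in different network namespaces. Once it is no longer needed, the namespace can be removed (exit the namespaced bash first):
ip netns delete sample1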
Cgroups (control groups) are a Linux kernel feature that limits, accounts for, and isolates the resource usage (CPU, memory, disk I/O, network, etc.) of a collection of processes.
Subsystems (also called controllers) include cpu, memory, blkio, devices, freezer, and others.
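The full set provided by the running kernel can be checked with:
cat /proc/cgroups   # columns: subsys_name, hierarchy, num_cgroups, enabled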
For example, this is how the freezer subsystem looks for the default cgroup (all processes):
root@twickenham:~# cat /sys/fs/cgroup/freezer/cgroup.procs | wc -l
193
For a container:
root@twickenham:~# docker container ls
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
69f525bf26f5 debian "/bin/bash" 8 weeks ago Up 2 hours testdebian
root@twickenham:~# cat /sys/fs/cgroup/freezer/docker/69f525bf26f5f4c16ab631ac96a16c9acbc5b5f69a924f63b4fe4fd16fd96657/cgroup.procs
1840
1939
1940
root@twickenham:~# ps aux | grep -e 1840 -e 1939 -e 1940
root 1840 0.0 0.0 4032 3356 pts/0 Ss+ 15:30 0:00 /bin/bash
root 1939 0.0 0.0 13084 1048 ? Ss 15:41 0:00 nginx: master process nginx
systemd+ 1940 0.0 0.0 13508 2692 ? S 15:41 0:00 nginx: worker process
root 2077 0.0 0.0 6076 840 pts/0 S+ 17:22 0:00 grep -e 1840 -e 1939 -e 1940
# Inside the container
root@69f525bf26f5:/# ps aux
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 1 0.0 0.0 4032 3352 pts/0 Ss 14:30 0:00 /bin/bash
root 10 0.0 0.0 13084 1048 ? Ss 14:41 0:00 nginx: master process
nginx 11 0.0 0.0 13508 2692 ? S 14:41 0:00 nginx: worker process
root 12 0.0 0.0 7648 2788 pts/0 R+ 16:10 0:00 ps aux
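The freezer controller is what docker pause/unpause relies on; a hedged sketch of freezing the same cgroup by hand (cgroup-v1 layout, paths as shown above):
# freeze every process in the container's cgroup
echo FROZEN > /sys/fs/cgroup/freezer/docker/69f525bf26f5f4c16ab631ac96a16c9acbc5b5f69a924f63b4fe4fd16fd96657/freezer.state
cat /sys/fs/cgroup/freezer/docker/69f525bf26f5f4c16ab631ac96a16c9acbc5b5f69a924f63b4fe4fd16fd96657/freezer.state   # FREEZING until done, then FROZEN
# resume the processes
echo THAWED > /sys/fs/cgroup/freezer/docker/69f525bf26f5f4c16ab631ac96a16c9acbc5b5f69a924f63b4fe4fd16fd96657/freezer.state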
Released by IBM in 2008, LXC (pronounced "lexy") combines chroot, kernel namespaces, cgroups, SELinux policies, and AppArmor profiles to provide a containerized environment.
From the official website: "Our main focus is system containers. That is, containers which offer an environment as close as possible to the one you'd get from a VM but without the overhead that comes with running a separate kernel and simulating all the hardware."
The main components are lxd (the daemon) and lxd-client (which provides the lxc command-line client).
Examples
# Installation
cloud_user@ip-10-0-1-50:~$ sudo apt-get install lxd lxd-client
# Initialization
cloud_user@ip-10-0-1-50:~$ sudo lxd init
Do you want to configure a new storage pool (yes/no) [default=yes]?
Name of the storage backend to use (dir or zfs) [default=dir]:
Would you like LXD to be available over the network (yes/no) [default=no]?
Do you want to configure the LXD bridge (yes/no) [default=yes]?
Warning: Stopping lxd.service, but it can still be activated by:
lxd.socket
LXD has been successfully configured.
cloud_user@ip-10-0-1-50:~$
# A new lxdbr0 bridge with a random subnet was created
cloud_user@ip-10-0-1-50:~$ ip -4 addr
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 9001 qdisc pfifo_fast state UP group default qlen 1000
inet 10.0.1.50/24 brd 10.0.1.255 scope global eth0
valid_lft forever preferred_lft forever
5: lxdbr0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
inet 10.218.29.1/24 scope global lxdbr0
valid_lft forever preferred_lft forever
# Sources for images
cloud_user@ip-10-0-1-50:~$ sudo lxc remote list
+-----------------+------------------------------------------+---------------+--------+--------+
| NAME | URL | PROTOCOL | PUBLIC | STATIC |
+-----------------+------------------------------------------+---------------+--------+--------+
| images | https://images.linuxcontainers.org | simplestreams | YES | NO |
+-----------------+------------------------------------------+---------------+--------+--------+
| local (default) | unix:// | lxd | NO | YES |
+-----------------+------------------------------------------+---------------+--------+--------+
| ubuntu | https://cloud-images.ubuntu.com/releases | simplestreams | YES | YES |
+-----------------+------------------------------------------+---------------+--------+--------+
| ubuntu-daily | https://cloud-images.ubuntu.com/daily | simplestreams | YES | YES |
+-----------------+------------------------------------------+---------------+--------+--------+
# List of available images on one remote
cloud_user@ip-10-0-1-50:~$ sudo lxc image list ubuntu-daily: | head
+--------------------+--------------+--------+-----------------------------------------------+---------+----------+-------------------------------+
| ALIAS | FINGERPRINT | PUBLIC | DESCRIPTION | ARCH | SIZE | UPLOAD DATE |
+--------------------+--------------+--------+-----------------------------------------------+---------+----------+-------------------------------+
| arm64 (5 more) | 65ac61467b6d | yes | ubuntu 18.04 LTS arm64 (daily) (20200618) | aarch64 | 166.42MB | Jun 18, 2020 at 12:00am (UTC) |
+--------------------+--------------+--------+-----------------------------------------------+---------+----------+-------------------------------+
| armhf (5 more) | ca3e71f57c99 | yes | ubuntu 18.04 LTS armhf (daily) (20200618) | armv7l | 164.92MB | Jun 18, 2020 at 12:00am (UTC) |
+--------------------+--------------+--------+-----------------------------------------------+---------+----------+-------------------------------+
| b (11 more) | 1be5b44a55b7 | yes | ubuntu 18.04 LTS amd64 (daily) (20200618) | x86_64 | 179.28MB | Jun 18, 2020 at 12:00am (UTC) |
+--------------------+--------------+--------+-----------------------------------------------+---------+----------+-------------------------------+
| e (5 more) | 648aef59e416 | yes | ubuntu 19.10 amd64 (daily) (20200611) | x86_64 | 341.62MB | Jun 11, 2020 at 12:00am (UTC) |
# Launch a new container from an image
cloud_user@ip-10-0-1-50:~$ sudo lxc launch images:60ee6bef5d7d my-alpine
Creating my-alpine
Starting my-alpine
# List the cached image and the container status
cloud_user@ip-10-0-1-50:~$ sudo lxc image list
+-------+--------------+--------+------------------------------------+--------+--------+------------------------------+
| ALIAS | FINGERPRINT | PUBLIC | DESCRIPTION | ARCH | SIZE | UPLOAD DATE |
+-------+--------------+--------+------------------------------------+--------+--------+------------------------------+
| | 60ee6bef5d7d | no | Alpine 3.10 amd64 (20200621_13:00) | x86_64 | 2.40MB | Jun 21, 2020 at 6:03pm (UTC) |
+-------+--------------+--------+------------------------------------+--------+--------+------------------------------+
cloud_user@ip-10-0-1-50:~$ sudo lxc list
+-----------+---------+----------------------+------+------------+-----------+
| NAME | STATE | IPV4 | IPV6 | TYPE | SNAPSHOTS |
+-----------+---------+----------------------+------+------------+-----------+
| my-alpine | RUNNING | 10.218.29.137 (eth0) | | PERSISTENT | 0 |
+-----------+---------+----------------------+------+------------+-----------+
# Connect to the alpine container, and list interfaces
cloud_user@ip-10-0-1-50:~$ sudo lxc exec my-alpine -- /bin/ash
~ # ip -4 addr
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN qlen 1
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
6: eth0@if7: <BROADCAST,MULTICAST,UP,LOWER_UP,M-DOWN> mtu 1500 qdisc noqueue state UP qlen 1000
inet 10.218.29.137/24 brd 10.218.29.255 scope global eth0
valid_lft forever preferred_lft forever
~ #
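When done, the container can be stopped and removed with the standard lxc subcommands:
sudo lxc stop my-alpine
sudo lxc delete my-alpine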
Docker started as a Python script in 2008; by 2012 it had grown to a hundred independent microservices, and it became an open-source project in 2013.
In its infancy, Docker was just a wrapper around LXC with additional functionality on top.
The core is dockerd, which exposes a REST API that can be invoked directly or through the docker CLI. The free version is called docker-ce (the enterprise version is docker-ee).
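A quick sketch of talking to that API directly, without the CLI, over the default unix socket (unversioned requests use the latest API version the daemon supports):
curl --unix-socket /var/run/docker.sock http://localhost/containers/json   # same data as `docker container ls`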
Images are divided into different layers. Docker uses copy-on-write (COW), which means a copy of an object is created only when we want to modify it.
[root@ip-10-0-1-100 cloud_user]# docker image pull alpine:latest
latest: Pulling from library/alpine
df20fa9351a1: Pull complete
Digest: sha256:185518070891758909c9f839cf4ca393ee977ac378609f700f60a771a2dfe321
Status: Downloaded newer image for alpine:latest
docker.io/library/alpine:latest
# a24bb4013296 is the image in our system
[root@ip-10-0-1-100 cloud_user]# docker history alpine
IMAGE CREATED CREATED BY SIZE COMMENT
a24bb4013296 3 weeks ago /bin/sh -c #(nop) CMD ["/bin/sh"] 0B
<missing> 3 weeks ago /bin/sh -c #(nop) ADD file:c92c248239f8c7b9b… 5.57MB
[root@ip-10-0-1-100 cloud_user]# docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
alpine latest a24bb4013296 3 weeks ago 5.57MB
The Dockerfile used to create the alpine image is available at https://hub.docker.com/_/alpine and hosted on GitHub:
FROM scratch
ADD alpine-minirootfs-3.12.0-x86_64.tar.gz /
CMD ["/bin/sh"]
This is a very simple image that builds on top of the Docker reserved minimal image "scratch" (https://hub.docker.com/_/scratch). Scratch is used to build base images (like debian or busybox) or super-minimal images that just copy in a binary and run it (like hello-world).
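A minimal sketch of that second pattern (the hello binary is hypothetical and would have to be statically linked, since scratch provides no libraries):
FROM scratch
COPY hello /
CMD ["/hello"]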
More complex, multi-layer images, like httpd (the Apache Docker image), build on top of a Debian image and run a series of commands to install dependencies and packages:
FROM debian:buster-slim
ENV HTTPD_PREFIX /usr/local/apache2
ENV PATH $HTTPD_PREFIX/bin:$PATH
RUN mkdir -p "$HTTPD_PREFIX" && chown www-data:www-data "$HTTPD_PREFIX"
WORKDIR $HTTPD_PREFIX
[...]
COPY httpd-foreground /usr/local/bin/
EXPOSE 80
CMD ["httpd-foreground"]
We can see all the layers when we pull the image:
[root@ip-10-0-1-100 cloud_user]# docker image pull httpd
Using default tag: latest
latest: Pulling from library/httpd
8559a31e96f4: Pull complete
bd517d441028: Pull complete
f67007e59c3c: Pull complete
83c578481926: Pull complete
f3cbcb88690d: Pull complete
Digest: sha256:387f896f9b6867c7fa543f7d1a686b0ebe777ed13f6f11efc8b94bec743a1e51
Status: Downloaded newer image for httpd:latest
docker.io/library/httpd:latest
[root@ip-10-0-1-100 cloud_user]# docker image history httpd
IMAGE CREATED CREATED BY SIZE COMMENT
ccbcea8a6757 12 days ago /bin/sh -c #(nop) CMD ["httpd-foreground"] 0B
<missing> 12 days ago /bin/sh -c #(nop) EXPOSE 80 0B
<missing> 12 days ago /bin/sh -c #(nop) COPY file:c432ff61c4993ecd… 138B
<missing> 12 days ago /bin/sh -c #(nop) STOPSIGNAL SIGWINCH 0B
<missing> 12 days ago /bin/sh -c set -eux; savedAptMark="$(apt-m… 60.9MB
<missing> 12 days ago /bin/sh -c #(nop) ENV HTTPD_PATCHES= 0B
<missing> 12 days ago /bin/sh -c #(nop) ENV HTTPD_SHA256=a497652a… 0B
<missing> 12 days ago /bin/sh -c #(nop) ENV HTTPD_VERSION=2.4.43 0B
<missing> 12 days ago /bin/sh -c set -eux; apt-get update; apt-g… 35.4MB
<missing> 12 days ago /bin/sh -c #(nop) WORKDIR /usr/local/apache2 0B
<missing> 12 days ago /bin/sh -c mkdir -p "$HTTPD_PREFIX" && chow… 0B
<missing> 12 days ago /bin/sh -c #(nop) ENV PATH=/usr/local/apach… 0B
<missing> 12 days ago /bin/sh -c #(nop) ENV HTTPD_PREFIX=/usr/loc… 0B
<missing> 12 days ago /bin/sh -c #(nop) CMD ["bash"] 0B
<missing> 12 days ago /bin/sh -c #(nop) ADD file:4d35f6c8bbbe6801c… 69.2MB
Docker Swarm
Docker Swarm enables us to take multiple hosts and pool them together into a swarm. A manager node spreads the load across multiple worker nodes.
Enable swarm mode on the manager node:
root@pzolo1c:~# docker swarm init --advertise-addr 172.31.25.177
Swarm initialized: current node (i5pzauemeje1pfcdznqr3vroe) is now a manager.
To add a worker to this swarm, run the following command:
docker swarm join --token SWMTKN-1-56p1ly19vbhpejakwnb4p3ooom2cfeen4s1jb8w84tu5fhnen4-7iq8usdv0fkw687k8vmk48f7f 172.31.25.177:2377
To add a manager to this swarm, run 'docker swarm join-token manager' and follow the instructions.
On the workers:
root@pzolo3c:~# docker swarm join --token SWMTKN-1-56p1ly19vbhpejakwnb4p3ooom2cfeen4s1jb8w84tu5fhnen4-7iq8usdv0fkw687k8vmk48f7f 172.31.25.177:2377
This node joined a swarm as a worker.
List swarm status on the manager:
root@pzolo1c:~# docker node ls
ID HOSTNAME STATUS AVAILABILITY MANAGER STATUS ENGINE VERSION
i5pzauemeje1pfcdznqr3vroe * pzolo1c.mylabserver.com Ready Active Leader 19.03.11
rudihv22bbfwnvbo6gmfg6c0d pzolo2c.mylabserver.com Ready Active 19.03.11
xp2n4759amxb1eq91lpw2bc7s pzolo3c.mylabserver.com Ready Active 19.03.11
Let's create a service with 2 replicas of the nginx container.
root@pzolo1c:~# docker service create --replicas 2 -p 80:80 --name myweb nginx
kzqy1s69xlfc3nl2xd60z4hbx
overall progress: 2 out of 2 tasks
1/2: running [==================================================>]
2/2: running [==================================================>]
verify: Service converged
The manager node assigns the tasks based on the number of replicas.
root@pzolo1c:~# docker service ls
ID NAME MODE REPLICAS IMAGE PORTS
kzqy1s69xlfc myweb replicated 2/2 nginx:latest *:80->80/tcp
On the worker:
root@pzolo2c:~# docker container ls
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
ad9a9d8aa4c8 nginx:latest "/docker-entrypoint.…" 3 minutes ago Up 3 minutes 80/tcp myweb.2.ggqhv3fa4fdcd2m8hgi3z40en
Details of the service. One instance is running on the manager:
root@pzolo1c:~# docker service ps myweb
ID NAME IMAGE NODE DESIRED STATE CURRENT STATE ERROR PORTS
igq2zwielyr8 myweb.1 nginx:latest pzolo1c.mylabserver.com Running Running 5 minutes ago
ggqhv3fa4fdc myweb.2 nginx:latest pzolo2c.mylabserver.com Running Running 5 minutes ago
Even though the service is only running on 2 nodes, it is accessible from ANY node in the swarm (the swarm routing mesh forwards the request):
root@pzolo3c:~# curl localhost:80 -v -so /dev/null
* Rebuilt URL to: localhost:80/
* Trying 127.0.0.1...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 80 (#0)
> GET / HTTP/1.1
> Host: localhost
> User-Agent: curl/7.52.1
> Accept: */*
>
< HTTP/1.1 200 OK
< Server: nginx/1.19.0
< Date: Sun, 21 Jun 2020 21:04:27 GMT
< Content-Type: text/html
< Content-Length: 612
< Last-Modified: Tue, 26 May 2020 15:00:20 GMT
< Connection: keep-alive
< ETag: "5ecd2f04-264"
< Accept-Ranges: bytes
<
{ [612 bytes data]
* Curl_http_done: called premature == 0
* Connection #0 to host localhost left intact
If one of the workers fails, the manager moves the running container to another worker in order to maintain the required number of replicas. For example, if we stop the docker service on worker 2:
root@pzolo1c:~# docker service ps myweb
ID NAME IMAGE NODE DESIRED STATE CURRENT STATE ERROR PORTS
igq2zwielyr8 myweb.1 nginx:latest pzolo1c.mylabserver.com Running Running 20 minutes ago
isk181g1xc6z myweb.2 nginx:latest pzolo3c.mylabserver.com Running Running 2 minutes ago
ggqhv3fa4fdc \_ myweb.2 nginx:latest pzolo2c.mylabserver.com Shutdown Running 20 minutes ago
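The desired replica count can also be changed after creation with the standard scale subcommand, e.g.:
docker service scale myweb=3   # the manager schedules a third task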