diff options
author | neal <neal> | 2003-09-07 21:49:29 +0000 |
---|---|---|
committer | neal <neal> | 2003-09-07 21:49:29 +0000 |
commit | 103f3951ba2e4ae7c1c39ad9c7d71a1df23586c9 (patch) | |
tree | 664e015e7626d3ad633b789a46c8ec5aeaab2077 | |
parent | b90d3c97c9ef64621f42e40258af9c571624f99f (diff) |
/
2003-09-07 Neal H. Walfield <neal@cs.uml.edu>
* configure.ac: Check for latex, dvips and ps2pdf. Bail if not
found. Generate doc/Makefile.
/doc/
Modularize the document by breaking each chapter into its own tex
file. Integrate into the build system.
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | configure.ac | 21 | ||||
-rw-r--r-- | doc/Makefile.am | 54 | ||||
-rw-r--r-- | doc/authentication.tex | 158 | ||||
-rw-r--r-- | doc/booting.tex | 271 | ||||
-rw-r--r-- | doc/debugging.tex | 10 | ||||
-rw-r--r-- | doc/device-drivers.tex | 422 | ||||
-rw-r--r-- | doc/hurd-on-l4.tex | 2708 | ||||
-rw-r--r-- | doc/introduction.tex | 44 | ||||
-rw-r--r-- | doc/ipc.tex | 1126 | ||||
-rw-r--r-- | doc/posix.tex | 403 | ||||
-rw-r--r-- | doc/threads-tasks.tex | 235 | ||||
-rw-r--r-- | doc/vmm.tex | 26 |
13 files changed, 2785 insertions, 2698 deletions
@@ -1,3 +1,8 @@ +2003-09-07 Neal H. Walfield <neal@cs.uml.edu> + + * configure.ac: Check for latex, dvips and ps2pdf. Bail if not + found. Generate doc/Makefile. + 2003-07-26 Marcus Brinkmann <marcus@gnu.org> * Initial check-in. diff --git a/configure.ac b/configure.ac index 15e4228..b0051a5 100644 --- a/configure.ac +++ b/configure.ac @@ -34,6 +34,20 @@ AC_PROG_CC AM_PROG_AS AC_PROG_RANLIB +# Required for building the documentation +AC_PATH_PROG([LATEX], [latex], no) +if test "x$LATEX" = xno; then + missing_progs="$missing_progs latex" +fi +AC_PATH_PROG([DVIPS], [dvips], no) +if test "x$DVIPS" = xno; then + missing_progs="$missing_progs dvips" +fi +AC_PATH_PROG([PS2PDF], [ps2pdf], no) +if test "x$PS2PDF" = xno; then + missing_progs="$missing_progs ps2pdf" +fi + # Checks for libraries. # Checks for header files. @@ -56,10 +70,15 @@ esac m4_include([libhurd-slab/headers.m4]) m4_include([libhurd-ihash/headers.m4]) +if test "x$missing_progs" != "x"; then + AC_MSG_ERROR([The following programs were not found:$missing_progs]) +fi + # Checks for library functions. AC_CONFIG_FILES([Makefile laden/Makefile libl4/ia32/Makefile libl4/Makefile libhurd-slab/Makefile - libhurd-ihash/Makefile]) + libhurd-ihash/Makefile + doc/Makefile]) AC_OUTPUT diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 0000000..5ee392b --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1,54 @@ +# Makefile.am - Makefile template for the manual. +# Copyright (C) 2003 Free Software Foundation, Inc. +# Written by Neal H. Walfield +# +# This file is part of the GNU Hurd. +# +# The GNU Hurd is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+# +# The GNU Hurd is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + +pkgdata_DATA = hurd-on-l4.dvi hurd-on-l4.ps hurd-on-l4.pdf + +# _DEPENDENCIES does not work with _DATA. +hurd-on-l4.dvi: \ + hurd-on-l4.tex \ + introduction.tex \ + booting.tex \ + ipc.tex \ + threads-tasks.tex \ + vmm.tex \ + authentication.tex \ + posix.tex \ + debugging.tex \ + device-drivers.tex + +SUFFIXES = .ps .dvi .tex + +.tex.dvi: + $(LATEX) $< -o $@ + while egrep "^LaTeX Warning:.*Rerun to" \ + `echo $< | sed -e 's/.tex$$/.log/'`; do \ + rm -f $(subst $<,.tex,.log); \ + $(LATEX) $< -o $@; \ + done + +.dvi.ps: + $(DVIPS) $< + +.ps.pdf: + $(PS2PDF) $< + +clean: + rm -f *.aux *.ps *.dvi *.pdf *.log *.toc + diff --git a/doc/authentication.tex b/doc/authentication.tex new file mode 100644 index 0000000..817afa9 --- /dev/null +++ b/doc/authentication.tex @@ -0,0 +1,158 @@ +\chapter{Authentication} +\label{auth} + +Capabilities are a good way to give access to protected objects and +services. They are flexible, lightweight and generic. However, Unix +traditionally uses access control lists (ACL) to restrict access to +objects like files. Any task running with a certain user ID can +access all files that are readable for the user with that user ID. +Although all objects are implemented as capabilities in the Hurd, the +Hurd also supports the use of user IDs for access control. + +The system authentication server \texttt{auth} implements the Unix +authentication scheme using capabilities. It provides auth +capabilities, which are associated with a list of effective and +available user and group IDs. 
The holder of such a capability can use +it to authenticate itself to other servers, using the protocol below. + +Of course, these other servers must use (and trust) the same +\texttt{auth} server as the user. Otherwise, the authentication will +fail. Once a capability is authenticated in the server, the server +will know the user IDs of the client, and can use them to validate +further operations. + +The \texttt{auth} server provides two types of capabilities: + +\subsubsection{Auth capabilities} +An auth capability is associated with four vectors of IDs: The +effective user and group IDs, which should be used by other servers to +authenticate operations that require certain user or group IDs, and +the available user and group IDs. Available IDs should not be used +for authentication purposes, but can be turned into effective IDs by +the holder of an auth capability at any time. + +New auth capabilities can be created from existing auth capabilities, +but only if the requested IDs are a subsets from the union of the +(effective and available) IDs in the provided auth capabilities. If +an auth capability has an effective or available user ID 0, then +arbitrary new auth objects can be created from that. + +\subsubsection{Passport capabilities} +A passport capability can be created from an auth capability and is +only valid for the task that created it. It can be provided to a +server in an authentication process (see below). For the client, the +passport capability does not directly implement any useful operation. +For the server, it can be used to verify the identity of a user and +read out the effective user and group IDs. + +The auth server should always create new passport objects for +different tasks, even if the underlying auth object is the same, so +that a task having the passport capability can not spy on other tasks +unless they were given the passport capability by that task. 
+ +\section{Authenticating a client to a server} + +A client can authenticate itself to a server with the following +protocol: + +\subsubsection{Preconditions} +The client $C$ has an auth capability implemented by the \texttt{auth} +server $A$. It also has a capability implemented by the server $S$. +It wants to reauthenticate this capability with the auth capability, +so the server associates the new user and group IDs with it. + +The server also has an auth capability implemented by its trusted +\texttt{auth} server. For the reauthentication to succeed, the +\texttt{auth} server of the client and the server must be identical. +If this is the case, the participating tasks hold task info caps for +all other participating tasks (because of the capabilities they hold). + +\begin{enumerate} +\item The client $C$ requests the passport capability for itself from + the auth capability from $A$. + + \begin{comment} + Normally, the client will request the passport capability only + once and store it together with the auth capability. + \end{comment} + +\item The \texttt{auth} server receives the request and creates a new + passport capability for this auth capability and this client. The + passport capability is returned to the user. + +\item The user receives the reply from the \texttt{auth} server. + + It then sends the reauthentication request to the server $S$, which + is invoked on the capability the client wants to reauthenticate. It + provides the passport capability as an argument. + +\item The server $S$ can accept the passport capability, if it + verifies that it is really implemented by the \texttt{auth} server + it trusts. If the client does not provide a passport capability to + the trusted \texttt{auth} server, the authentication process is + aborted with an error. + + Now the server can send a request to the \texttt{auth} server to + validate the passport capability. The RPC is invoked on the + passport capability. 
+ +\item The \texttt{auth} server receives the validation request on the + passport capability and returns the task ID of the client $C$ that + this passport belongs to, and the effective user and group IDs for + the auth cap to which this passport cap belongs. + + \begin{comment} + The Hurd on Mach returned the available IDs as well. This feature + is not used anywhere in the Hurd, and as the available IDs should + not be used for authentication anyway, this does not seem to be + useful. If it is needed, it can be added in an extended version + of the validation RPC. + \end{comment} + +\item The server receives the task ID and the effective user and group + IDs. The server now verifies that the task ID is the same as the + task ID of the sender of the reauthentication request. Only then + was the reauthentication request made by the owner of the auth cap. + It can then return a new capability authenticated with the new user + and group IDs. + + \begin{comment} + The verification of the client's task ID is necessary. As the + passport cap is copied to other tasks, it can not serve as a proof + of identity alone. It is of course absolutely crucial that the + server holds the task info cap for the client task $C$ for the + whole time of the protocol. But the same is actually true for any + RPC, as the server needs to be sure that the reply message is sent + to the sender thread (and not any imposter). + \end{comment} + +\item The client receives the reply with the new, reauthenticated + capability. Usually this capability is associated in the server + with the same abstract object, but different user credentials. + + \begin{comment} + Of course a new capability must be created. Otherwise, all other + users holding the same capability would be affected as well. + \end{comment} + + The client can now deallocate the passport cap. + + \begin{comment} + As said before, normally the passport cap is cached by the client + for other reauthentications. 
+ \end{comment} +\end{enumerate} + +\subsubsection{Result} +The client $C$ has a new capability that is authenticated with the new +effective user and group IDs. The server has obtained the effective +user and group IDs from the \texttt{auth} server it trusts. + +\begin{comment} + The Hurd on Mach uses a different protocol, which is more complex + and is vulnerable to DoS attacks. The above protocol can not + readily be used on Mach, because the sender task of a message can + not be easily identified. +\end{comment} + + diff --git a/doc/booting.tex b/doc/booting.tex new file mode 100644 index 0000000..8d39fcd --- /dev/null +++ b/doc/booting.tex @@ -0,0 +1,271 @@ +\chapter{Booting} + +A multiboot-compliant bootloader, for example GNU GRUB, loads the +loader program \texttt{laden}, the kernel, $\sigma_0$, the rootserver +and further modules. The loader is started, patches the kernel +interface page, and starts the kernel. The kernel starts $\sigma_0$ +and the rootserver. The rootserver has to deal with the other +modules. + + +\section{System bootstrap} + +The initial part of the boot procedure is system specific. + + +\subsection{Booting the ia32} + +On the ia32, the BIOS will be one of the first things to run. +Eventually, the BIOS will start the bootloader. The Hurd requires a +multiboot-compliant bootloader, such as GNU GRUB. A typical +configuration file entry in the \verb/menu.list/ file of GNU GRUB will +look like this: + +\begin{verbatim} +title = The GNU Hurd on L4 +root = (hd0,0) +kernel = /boot/laden +module = /boot/ia32-kernel +module = /boot/sigma0 +module = /boot/rootserver +module = ...more servers... +\end{verbatim} + +\begin{comment} + The name of the rootserver and the further modules are not specified + yet. +\end{comment} + +GNU GRUB loads the binary image files into memory and jumps to the +entry point of \texttt{laden}. + + +\section{The loader \texttt{laden}} + +\texttt{laden} is a multiboot compliant kernel from the perspective of +GNU GRUB. 
It expects at least three modules. The first module is the +L4 kernel image, the second module is the $\sigma_0$ server image, and +the third module is the rootserver image. + +\begin{comment} + Later, the L4 kernel will support the optional UTCB paging server + $\sigma_1$, which has to be treated like the other initial servers + by \texttt{laden}. A command line option to \texttt{laden} will + allow the user to specify if the third module is the rootserver or + $\sigma_1$. If $\sigma_1$ is used, the rootserver is the fourth + module in the list. +\end{comment} + +\texttt{laden} copies (or moves) the three executable images to the +right location in memory, according to their respective ELF headers. +It also initializes the BSS section to zero. + +\begin{comment} + Laden has to deal with overlapping source and destination memory + areas in an intelligent way. It currently will detect such + situations, but is not always able to find a solution, even if one + exists. + + If a memory area stretches out to the very last page addressible in + 32 bit, the high address of the memory descriptor will overflow. + This is in fact the behaviour of \texttt{kickstart}. \texttt{laden} + currently truncates such an area by one page. This needs + clarification in the L4 standard. +\end{comment} + +Then it searches for the kernel interface page (KIP) in the L4 kernel +image and modifies it in the following way: + +\begin{itemize} +\item The memory descriptors are filled in according to the memory + layout of the system. On ia32, this information is -- at least + partially -- provided by GNU GRUB. + + \begin{comment} + GNU GRUB seems to omit information about the memory that is shared + with the VGA card. \texttt{laden} creates a special entry for + that region, overriding any previous memory descriptor. + \end{comment} + +\item The start and end addresses and the entry point of the initial + servers are filled in. 
+ + \begin{comment} + A future version of L4 should support adding information about the + UTCB area of the initial rootserver as well. Until then, the + rootserver has no clean way to create a new thread (a hack is used + by the rootserver to calculate the UTCB addresses for other + threads). + \end{comment} + +\item The \verb/boot_info/ field is initialized. + + \begin{comment} + The \verb/boot_info/ field is currently set to the GNU GRUB + \verb/multiboot_info/ structure. This only works for the ia32 + architecture of course. We might want to have a more architecture + independent way to pass the information about further modules to + the rootserver. We also might want to gather the information + provided by GNU GRUB in a single page (if it is not). + \end{comment} +\end{itemize} + + +\section{The L4 kernel} + +The L4 kernel initializes itself and then creates the address spaces +and threads for the initial servers $\sigma_0$ and the rootserver. It +maps all physical memory idempotently into $\sigma_0$, and sets the +pager of the rootserver thread to $\sigma_0$. Then it starts the +initial servers. + + +\section{The initial server $\sigma_0$} + +$\sigma_0$ acts as the pager for the rootserver, answering page fault +messages by mapping the page at the fault address idempotently in the +rootserver. + +\begin{comment} + $\sigma_0$ can also be used directly by sending messages to it, + according to the $\sigma_0$ RPC protocol. This is used by the kernel + to allocate reserved memory, but can also be used by the user to + explicitely allocate more memory than single pages indirectly via + page faults. +\end{comment} + +The thread ID of $\sigma_0$ is (\verb/UserBase, 1)/. + +\begin{comment} + We will write all thread IDs in the form (\verb/thread nr/, + \verb/version/). +\end{comment} + +Any fpage will only be provided to one thread. 
$\sigma_0$ will return +an error if another thread attempts to map or manipulate an fpage that +has already been given to some other thread, even if both threads +reside in the same address space. + + +\section{The initial server $\sigma_1$} + +$\sigma_1$ is intended to provide a paging service for UTCB memory. +This will allow orthogonal persistence to be implemented. It is not +yet supported. + +The thread ID of $\sigma_1$ is (\verb/UserBase + 1, 1)/. + + +\section{The rootserver} +\label{rootserver} + +The rootserver is the only task in the system which threads can +perform privileged system calls. So the rootserver must provide +wrappers for the system calls to other unprivileged system tasks. + +\begin{comment} + For this, a simple authentication scheme is required. The + rootserver can keep a small, statically allocated table of threads + which are granted access to the system call wrappers. The caller + could provide the index in the table for fast O(1) lookup instead + linear search. Threads with access could be allowed to add other + threads or change existing table entries. The same scheme can be + used in the device driver framework. + + The rootserver should have one thread per CPU, and run at a high + priority. +\end{comment} + +The rootserver has the following initial state: + +\begin{itemize} +\item Its thread ID is (\verb/UserBase + 2/, 1). + +\item The priority is set to the 255, the maximum value. + + \begin{comment} + The rootserver, or at least the system call wrapper, should run at + a very high priority. + \end{comment} + +\item The instruction pointer \verb/%eip/ is set to the entry point, +all other registers are undefined (including the stack pointer). + +\item The pager is set to $\sigma_0$. + +\item The exception handler set to \verb/nilthread/. + +\item The scheduler is set to the rootserver thread itself. +\end{itemize} + +So the first thing the rootserver has to do is to set up a simple +stack. 
+ +Then the rootserver should evaluate the \verb/boot_info/ field in the +KIP to find the information about the other modules. It should parse +the information and create the desired initial tasks of the operating +system. The Hurd uses a boot script syntax to allow to pass +information about other initial tasks and the root tasks to each +initial task in a generalized manner. + +\begin{comment} + The exact number and type of initial tasks necessary to boot the + Hurd are not yet known. Chances are that this list includes the + \texttt{task} server, the physical memory server, the device + servers, and the boot filesystem. The boot filesystem might be a + small simple filesystem, which also includes the device drivers + needed to access the real root filesystem. +\end{comment} + + +\section{The physical memory server} + +To be written. + +\begin{comment} + In fact, I already have some ideas. Here they are: + + The rootserver copies (or moves) the physical memory server + executable image to the right location in memory, according to its + respective ELF header. It also initializes the BSS section to zero. + + Then it follows the \texttt{exec()} protocol to startup the new + task. This should be done as transparently as possible. All pages + the rootserver provides because of page faults should be granted. + The rootserver waits for the physical memory server to contact the + rootserver thread. Then the following startup protocol is walked + through: + + \begin{enumerate} + \item The physical memory server requests all system memory from the + rootserver. The rootserver maps the memory from $\sigma_0$ and + grants it to the physical memory server. Alternatively, the + physical memory server might get the memory directly from + $\sigma_0$, but it should ask the rootserver for the amount and + location of memory to get. 
+ + \item For each module that has not been used yet, the rootserver + requests a capability in the physical memory server that can be + used to map in pages from the range of memory that the module + occupies. These capabilities should implement the same pager + interface that mappable files implement. + + The idea is that these capabilities can be used in the + \texttt{exec()} protocol to start up the tasks for these modules. + If a module is not a task, the capability can be used to access + the module data by mapping it into the address space like a file. + The physical memory server can even swap out pages that back these + objects on memory pressure. + + So, the physical memory server is in fact a simple filesystem for + these initial tasks, usable only for mapping operations. + + \item The rootserver can then start up the other tasks in the module + list using the normal \texttt{exec()} protocol. + \end{enumerate} + + The result is that all tasks except for the rootserver can be + started like normal Hurd tasks, and can also be swapped out. +\end{comment} + + diff --git a/doc/debugging.tex b/doc/debugging.tex new file mode 100644 index 0000000..23f8230 --- /dev/null +++ b/doc/debugging.tex @@ -0,0 +1,10 @@ +\chapter{Debugging} +\label{debug} + +L4 does not support debugging. So every task has to implement a debug +interface and implement debugging locally. gdb needs to be changed to +make use of this interface. How to perform the required +authentication, and how the debug thread is advertised to gdb, and how +the debug interface should look like, are all open questions. + + diff --git a/doc/device-drivers.tex b/doc/device-drivers.tex new file mode 100644 index 0000000..b7e3215 --- /dev/null +++ b/doc/device-drivers.tex @@ -0,0 +1,422 @@ +\chapter{Device Drivers} + +This section written by Peter De Schrijver and Daniel Wagner. + +\section{Requirements} + + \begin{itemize} + \item Performance: Speed is important! 
+ \item Portability: Framework should work on different architectures. + + Also: Useable in a not hurdisch environment with only + small changes. + + \item Flexibility + \item Convenient interfaces + \item Consistency + \item Safety: driver failure should have as minimal system impact as + possible. + \end{itemize} + +\section{Overview} + + The framework consists of: + \begin{itemize} + \item Bus drivers + \item Device drivers + \item Service servers (plugin managers, $\omega_0$, rootserver) + \end{itemize} + +\subsection{Drivers and the filesystem} + + The device driver framework will only offer a physical device view. + Ie. it will be a tree with devices as the leaves connected by + various bus technologies. Any logical view and naming persistence + will have to be build on top of this (translator). + +\subsection{Layer of the drivers} + + The device driver framework consists only of the lower level drivers + and doesn't need to have a complicated scheme for access control. + This is because it should be possible to share devices, e.g. for + neighbour Hurd. The authentication is done by installing a virtual + driver in each OS/neighour Hurd. The driver framework trusts these + virtual drivers. So it's possible for a non Hurdish system to use + the driver framework just by implementing these virtual drivers. + + Only threads which have registered as trusted are allowed to access + device drivers. The check is simply done by checking the senders + ID against a table of known threads. + +\subsection{Address spaces} + + Drivers always reside in their own AS. The overhead for cross AS IPC + is small enough to do so. + +\subsection{Zero copying and DMA} + + It is assumed that there are no differences between physical memory + pages. For example each physical memory page can be used for DMA + transfers. Of course, older hardware like ISA devices can so not be + supported. Who cares? 
+ + With this assumption, the device driver framework can be given any + physical memory page for DMA operation. This physical memory page + must be pinned down. + + If an application wants to send or receive data to/from a device + driver it has to tell the virtual driver the page on which the + operation has to be executed. Since the application doesn't know + the virtual-real memory mapping, it has to ask the physical memory + manager for the real memory address of the page in question. If the + page is not directly mapped from the physical memory manager the + application ask the mapper (another application which has mapped + this memory region the first application) to resolve the mapping. + This can be done recursively. Normally, this resolving of mapping + can be speed up using a cache services, since a small number of + pages are reused very often. + + With the scheme, the drivers do not have to take special care of + zero copying if there is only one virtual driver. When there is + more than one virtual driver pages have to copied for all other + virtual drivers. + +\subsection{Root bus driver} + + The root bus is the entrypoint to look up devices. + + XXX There should be iterators/visitors for operating on + busses/devices. (daniel) + +\subsection{Physical versus logical device view} + + The device driver framework will only offer a physical device view. + Ie. it will be a tree with devices as the leaves connected by + various bus technologies. Any logical view and naming persistence + will have to be build on top of this (translator). + +\subsection{Things for the future} + + \begin{itemize} + \item Interaction with the task server (e.g. listings driver threads + with ps,etc.) + \item Powermanagement + \end{itemize} + +\section{Bus Drivers} + +A bus driver is responsible to manage the bus and provide access to +devices connected to it. 
In practice it means a bus driver has to +perform the following tasks: + +\begin{itemize} +\item Handle hotplug events + + Busses which do not support hotplugging, will treated as if there is + 1 insertion event for every device connected to it when the bus + driver is started. Drivers which don't support autoprobing of + devices will probably have to read some configuration data from a + file or if the driver is a needed for bootstrapping configuration + can be given as argument on its stack. In some cases the bus + doesn't generate insertion/removal events, but can still support + some form of hotplug functionality if the user tells the driver when + a change to the bus configuration has happened (eg. SCSI). + +\item Configure client device drivers + + The bus driver should start the appropriate client device driver + translator when an insertion event is detected. It should also + provide the client device driver with all necessary configuration + info, so it can access the device it needs. This configuration data + typically consists of the bus addresses of the device and possibly + IRQ numbers or DMA channel ID's. The device driver is loaded by the + assotiatet plugin manager. + +\item Provide access to devices + + This means the bus driver should be able to perform a bus + transaction on behalf of a client device driver. In some cases this + involves sending a message and waiting for reply (eg. SCSI, USB, + IEEE 1394, Fibre Channel,...). The driver should provide + send/receive message primitives in this case. In other cases + devices on the bus can be accessed by doing a memory accesses or by + using special I/O instructions. In this case the driver should + provide mapping and unmapping primitives so a client device driver + can get access to the memory range or is allowed to access the I/O + addresses. The client device driver should use a library, which is + bus dependant, to access the device on the bus. 
This library hides + the platform specific details of accessing the bus. + + Furthermore the bus driver must also support rescans for hardware. + It might be that not all drivers are found during bootstrapping and + hence later on drivers could be loaded. This is done by regenerate + new attach notification sending to bus's plugin manager. The plugin + manager loads then if possible a new driver. A probe funtion is not + needed since all supported hardware can be identified by + vendor/device identifactions (unlike ISA hardware). For hardware + busses which don't support such identifaction (ISA) only static + configuration is possible (configuration scripts etc.) +\end{itemize} + + +\subsection{Plugin Manager} + + Each bus driver has a handle/reference to which insert/remove events + are send. The owner of the handle/refence must then take + appropriate action like loading the drivers. These actors are + called plugin managers. + +\subsection{Generic Bus Driver} + + Operations: + \begin{itemize} + \item notify (attach, detach) + \item string enumerate + \end{itemize} + + XXX Extract generic bus services from the PCI Bus Driver section + which could be also be used other PCI related busses (ISA) be used. + The name for this service is missleading, since a SCSI Bus Driver + does not have anything in common with a PCI bus. (daniel) + +\subsection{ISA Bus Driver} +Inherits from: + +\begin{itemize} +\item Generic Bus Driver +\end{itemize} + +Operations: +\begin{itemize} +\item (none) +\end{itemize} + +XXX The interface has not been defined up to now. 
(daniel) + + +\subsection{PCI Bus Driver} + +Inherits from: +\begin{itemize} +\item Generic Bus Driver +\end{itemize} + +Operations: +\begin{itemize} +\item map\_mmio: map a PCI BAR for MMIO +\item map\_io: map a PCI BAR for I/O +\item map\_mem: map a PCI BAR for memory +\item read\_mmio\_{8,16,32,64}: read from a MMIO register +\item write\_mmio\_{8,16,32,64}: write to a MMIO register +\item read\_io\_{8,16,32,64}: read from an IO register +\item write\_io\_{8,16,32,64}: write to an IO register +\item read\_config\_{8,16,32,?}: read from a PCI config register +\item write\_config\_{8,16,32,?}: write to a PCI config register +\item alloc\_dma\_mem(for non zero copying): allocate main memory useable for DMA +\item free\_dma\_mem (for non zero copying): free main memory useable for DMA +\item prepare\_dma\_read: write back CPU cachelines for DMAable memory area +\item sync\_dma\_write: discard CPU cachelines for DMAable memory area +\item alloc\_consistent\_mem: allocate memory which is consistent between CPU + and device +\item free\_consistent\_mem: free memory which + is consistent between CPU and device +\item get\_irq\_mapping (A,B,C,D): get the IRQ matching the INT(A,B,C,D) line +\end{itemize} + +\section{Device Drivers} +\subsection{Classes} +\begin{itemize} +\item character: This the standard tty as known in the Unix environment. +\item block +\item human input: Keyboard, mouse, ... +\item packet switched network +\item circuit switched network +\item framebuffer +\item streaming audio +\item streaming video +\item solid state storage: flash memory +\end{itemize} + +\subsection{Human input devices (HID) and the console} + +The HIDs and the console are critical for user interaction with the +system. Furthmore, the console should be working as soons as possible +to give feedback. Log messages which are send to the console before +the hardware has been initialized should be buffered. 
+ +\subsection{Generic Device Driver} +Operations: +\begin{itemize} +\item init : prepare hardware for use +\item start : start normal operation +\item stop : stop normal operation +\item deinit : shutdown hardware +\item change\_irq\_peer : change peer thread to propagate irq message to. +\end{itemize} + + +\subsection{ISA Devices} +Inherits from: +\begin{itemize} +\item Generic Device Driver +\end{itemize} + +Supported devices +\begin{itemize} +\item Keyboard (ps2) +\item serial port (mainly for debugging purposses) +\item parallel port +\end{itemize} + +XXX interface definition for each device driver is missing. (daniel) + + +\subsection{PCI Devices} +Inherits from: +\begin{itemize} +\item Generic Device Driver +\end{itemize} + +Supported devices: +\begin{itemize} +\item block devices +\item ... +\end{itemize} + +XXX interface definition for each device driver is missing. (daniel) + + +\section{Resource Management} + + +\subsection{IRQ handling} + +\subsubsection{IRQ based interrupt vectors} + +Some CPU architectures (eg 68k, IA32) can directly jump to an +interrupt vector depending on the IRQ number. This is typically the +case on CISC CPU's. In this case there is some priorization scheme. On +IA32 for example, the lowest IRQ number has the highest priority. +Sometimes the priorities are programmable. Most RISC CPU's have only +a few interrupt vectors which are connected external IRQs. (typically +1 or 2). This means the IRQ handler should read a register in the +interrupt controller to determine which IRQ handler has to be +executed. Sometimes the hardware assists here by providing a register +which indicates the highest priority interrupt according to some +(programmable) scheme. + +\subsubsection{IRQ acknowlegdement} + +The IRQ acknowledgement is done in two steps. First inform the +hardware about the successful IRQ acceptance. Then inform the ISRs +about the IRQ event. 


\subsubsection{Edge versus level triggered IRQs}

Edge triggered IRQs typically don't need explicit acknowledgment by
the CPU at the device level. You can just acknowledge them at the
interrupt controller level. Level triggered IRQs typically need to be
explicitly acknowledged by the CPU at the device level. The CPU has to
read or write a register from the IRQ generating peripheral to make
the IRQ go away. If this is not done, the IRQ handler will be
reentered immediately after it ended, effectively creating an endless
loop. Another way of preventing this would be to mask the IRQ.

\subsubsection{Multiple interrupt controllers}

Some systems have multiple interrupt controllers in cascade. This is
for example the case on a PC, where you have 2 8259 interrupt
controllers. The second controller is connected to the IRQ 2 pin of
the first controller. It is also common in non PC systems which still
use some standard PC components such as a Super IO controller. In this
case the 2 8259's are connected to 1 pin of the primary interrupt
controller. Important for the software here is that you need to
acknowledge IRQ's at each controller. So to acknowledge an IRQ from
the second 8259 connected to the first 8259 connected to another
interrupt controller, you have to give an ACK command to each of those
controllers. Another important fact is that on PC architecture the order
of the ACKs is important.

\subsubsection{Shared IRQs}

Some systems have shared IRQs. In this case the IRQ handler has to
look at all devices using the same IRQ...

\subsubsection{IRQ priorities}

All IRQs on L4 have priorities, so if an IRQ occurs any IRQ lower than
the first IRQ will be blocked until the first IRQ has been
acknowledged. ISR priorities must match the hardware priority (danger
of priority inversion). Furthermore the IRQ acknowledgment order is
important.

The 8259 also supports a specific IRQ acknowledge iirc.
But, this
scheme does not work in most level triggered IRQ environments. In
these environments you must acknowledge (or mask) the IRQ before
leaving the IRQ handler, otherwise the CPU will immediately reenter
the IRQ handler, effectively creating an endless loop. In this case L4
would have to mask the IRQ. The IRQ thread would have to unmask it
after acknowledgement and processing.

\subsubsection{IRQ handling by L4/x86}

The L4 kernel does handle IRQ acknowledgment.


\subsection{$\omega_0$}

$\omega_0$ is a system-central IRQ-logic server. It runs in the
privileged AS space in order to be allowed to reroute IRQ IPC.

If an IRQ is shared between several devices, the drivers are daisy
chained and have to notify their peers if an IRQ IPC has arrived.

XXX For more detail see XXX URL missing

Operations:
\begin{itemize}
\item attach\_irq : attach an ISR thread to the IRQ
\item detach\_irq : detach an ISR thread from the IRQ
\end{itemize}


\subsection{Memory}
If no physical memory pages are provided by the OS, the device driver
framework allocates pages from the physical memory manager. The device
driver framework never has to handle any virtual to
physical page mapping.


\section{Bootstrapping}

A simpleFS provides initial drivers for bootstrapping. The root bus
driver and simpleFS is loaded by grub as a module. It then signals for
loading new (bus) drivers. As before, if there is no driver available
for the device for some reason, the bus driver doesn't change the
device state and waits for a notification that there are new drivers
available. This simpleFS might be based on BSD libstand (library for
standalone applications). simpleFS doesn't need to be writeable
either.


\subsection{Plugin Manager}
A Plugin manager handles driver loading for devices. It searches for
drivers in search paths (on filesystems). It is possible to add new
search paths later.
This allows the system to bootstrap with only +one search path (the simpleFS). When the search path is changed, the +device tree will be scanned for devices which don't have a driver +loaded yet. If a driver has become available, it will be loaded. + + +\section{Order of implementation} + +\begin{enumerate} +\item rootserver, plugin server +\item root bus server +\item pci bus +\item isa bus +\item serial port (isa bus) +\item console +\end{enumerate} + + diff --git a/doc/hurd-on-l4.tex b/doc/hurd-on-l4.tex index b48f0e9..8d8ffaa 100644 --- a/doc/hurd-on-l4.tex +++ b/doc/hurd-on-l4.tex @@ -1,4 +1,4 @@ -\documentclass[9pt,a4paper]{extarticle} +\documentclass{book} %\usepackage{german} %\usepackage[margin=2.5cm]{geometry} @@ -9,2706 +9,20 @@ \date{August 2003} \begin{document} + \maketitle -\newpage \tableofcontents -\newpage \setlength{\parindent}{0pt} \setlength{\parskip}{1ex plus 0.5ex minus 0.2ex} -\section{Introduction} - -The GNU Hurd is a multi-server operating system running on top of a -microkernel (currently Mach variants). The core motivation of the -Hurd is the following: - -\begin{quote} - \emph{The operating system should enable its users to share the - resources of the system without harming each other.} -\end{quote} - -The focus is on the user, the system should try to allow the user to -do anything that is not harmful for other users. Many operating -systems either restrict what the user can do to be more secure, while -others allow the user to do everything, but fail on protecting the -users from each other effectively. - -The Hurd is designed to minimize the system code that the user is -required to use, while allowing the user to use, ignore or replace the -remaining system code, and this without harming other users. - -So while the L4 microkernel tries to minimize the policy that the -kernel enforces on the software running on it, the Hurd tries to -minimize the policy that the operating system enforces on its users. 
-Furthermore, the Hurd also aims to provide a POSIX compatible general -purpose operating system. However, this POSIX personality of the Hurd -is provided for convenience only, and to make the Hurd useful. Other -personalities can be implemented and used by the users of the system -along with the POSIX personality. This default personality of the -Hurd also provides some convenient features that allow the user to -extend the system so that all POSIX compatible programs can take -advantage of it. - -These notes are a moving target in the effort to find the best -strategy to port the Hurd to the L4 microkernel. - -\begin{comment} - Remarks about the history of a certain feature and implementation - details are set in a smaller font and separated from the main text, - just like this paragraph. Because this is work in progress, there - are naturally a lot of such comments. -\end{comment} - - -\section{Booting} - -A multiboot-compliant bootloader, for example GNU GRUB, loads the -loader program \texttt{laden}, the kernel, $\sigma_0$, the rootserver -and further modules. The loader is started, patches the kernel -interface page, and starts the kernel. The kernel starts $\sigma_0$ -and the rootserver. The rootserver has to deal with the other -modules. - - -\subsection{System bootstrap} - -The initial part of the boot procedure is system specific. - - -\subsubsection{Booting the ia32} - -On the ia32, the BIOS will be one of the first things to run. -Eventually, the BIOS will start the bootloader. The Hurd requires a -multiboot-compliant bootloader, such as GNU GRUB. A typical -configuration file entry in the \verb/menu.list/ file of GNU GRUB will -look like this: - -\begin{verbatim} -title = The GNU Hurd on L4 -root = (hd0,0) -kernel = /boot/laden -module = /boot/ia32-kernel -module = /boot/sigma0 -module = /boot/rootserver -module = ...more servers... -\end{verbatim} - -\begin{comment} - The name of the rootserver and the further modules are not specified - yet. 
-\end{comment} - -GNU GRUB loads the binary image files into memory and jumps to the -entry point of \texttt{laden}. - - -\subsection{The loader \texttt{laden}} - -\texttt{laden} is a multiboot compliant kernel from the perspective of -GNU GRUB. It expects at least three modules. The first module is the -L4 kernel image, the second module is the $\sigma_0$ server image, and -the third module is the rootserver image. - -\begin{comment} - Later, the L4 kernel will support the optional UTCB paging server - $\sigma_1$, which has to be treated like the other initial servers - by \texttt{laden}. A command line option to \texttt{laden} will - allow the user to specify if the third module is the rootserver or - $\sigma_1$. If $\sigma_1$ is used, the rootserver is the fourth - module in the list. -\end{comment} - -\texttt{laden} copies (or moves) the three executable images to the -right location in memory, according to their respective ELF headers. -It also initializes the BSS section to zero. - -\begin{comment} - Laden has to deal with overlapping source and destination memory - areas in an intelligent way. It currently will detect such - situations, but is not always able to find a solution, even if one - exists. - - If a memory area stretches out to the very last page addressible in - 32 bit, the high address of the memory descriptor will overflow. - This is in fact the behaviour of \texttt{kickstart}. \texttt{laden} - currently truncates such an area by one page. This needs - clarification in the L4 standard. -\end{comment} - -Then it searches for the kernel interface page (KIP) in the L4 kernel -image and modifies it in the following way: - -\begin{itemize} -\item The memory descriptors are filled in according to the memory - layout of the system. On ia32, this information is -- at least - partially -- provided by GNU GRUB. - - \begin{comment} - GNU GRUB seems to omit information about the memory that is shared - with the VGA card. 
\texttt{laden} creates a special entry for - that region, overriding any previous memory descriptor. - \end{comment} - -\item The start and end addresses and the entry point of the initial - servers are filled in. - - \begin{comment} - A future version of L4 should support adding information about the - UTCB area of the initial rootserver as well. Until then, the - rootserver has no clean way to create a new thread (a hack is used - by the rootserver to calculate the UTCB addresses for other - threads). - \end{comment} - -\item The \verb/boot_info/ field is initialized. - - \begin{comment} - The \verb/boot_info/ field is currently set to the GNU GRUB - \verb/multiboot_info/ structure. This only works for the ia32 - architecture of course. We might want to have a more architecture - independent way to pass the information about further modules to - the rootserver. We also might want to gather the information - provided by GNU GRUB in a single page (if it is not). - \end{comment} -\end{itemize} - - -\subsection{The L4 kernel} - -The L4 kernel initializes itself and then creates the address spaces -and threads for the initial servers $\sigma_0$ and the rootserver. It -maps all physical memory idempotently into $\sigma_0$, and sets the -pager of the rootserver thread to $\sigma_0$. Then it starts the -initial servers. - - -\subsection{The initial server $\sigma_0$} - -$\sigma_0$ acts as the pager for the rootserver, answering page fault -messages by mapping the page at the fault address idempotently in the -rootserver. - -\begin{comment} - $\sigma_0$ can also be used directly by sending messages to it, - according to the $\sigma_0$ RPC protocol. This is used by the kernel - to allocate reserved memory, but can also be used by the user to - explicitely allocate more memory than single pages indirectly via - page faults. -\end{comment} - -The thread ID of $\sigma_0$ is (\verb/UserBase, 1)/. 
- -\begin{comment} - We will write all thread IDs in the form (\verb/thread nr/, - \verb/version/). -\end{comment} - -Any fpage will only be provided to one thread. $\sigma_0$ will return -an error if another thread attempts to map or manipulate an fpage that -has already been given to some other thread, even if both threads -reside in the same address space. - - -\subsection{The initial server $\sigma_1$} - -$\sigma_1$ is intended to provide a paging service for UTCB memory. -This will allow orthogonal persistence to be implemented. It is not -yet supported. - -The thread ID of $\sigma_1$ is (\verb/UserBase + 1, 1)/. - - -\subsection{The rootserver} -\label{rootserver} - -The rootserver is the only task in the system which threads can -perform privileged system calls. So the rootserver must provide -wrappers for the system calls to other unprivileged system tasks. - -\begin{comment} - For this, a simple authentication scheme is required. The - rootserver can keep a small, statically allocated table of threads - which are granted access to the system call wrappers. The caller - could provide the index in the table for fast O(1) lookup instead - linear search. Threads with access could be allowed to add other - threads or change existing table entries. The same scheme can be - used in the device driver framework. - - The rootserver should have one thread per CPU, and run at a high - priority. -\end{comment} - -The rootserver has the following initial state: - -\begin{itemize} -\item Its thread ID is (\verb/UserBase + 2/, 1). - -\item The priority is set to the 255, the maximum value. - - \begin{comment} - The rootserver, or at least the system call wrapper, should run at - a very high priority. - \end{comment} - -\item The instruction pointer \verb/%eip/ is set to the entry point, -all other registers are undefined (including the stack pointer). - -\item The pager is set to $\sigma_0$. - -\item The exception handler set to \verb/nilthread/. 
- -\item The scheduler is set to the rootserver thread itself. -\end{itemize} - -So the first thing the rootserver has to do is to set up a simple -stack. - -Then the rootserver should evaluate the \verb/boot_info/ field in the -KIP to find the information about the other modules. It should parse -the information and create the desired initial tasks of the operating -system. The Hurd uses a boot script syntax to allow to pass -information about other initial tasks and the root tasks to each -initial task in a generalized manner. - -\begin{comment} - The exact number and type of initial tasks necessary to boot the - Hurd are not yet known. Chances are that this list includes the - \texttt{task} server, the physical memory server, the device - servers, and the boot filesystem. The boot filesystem might be a - small simple filesystem, which also includes the device drivers - needed to access the real root filesystem. -\end{comment} - - -\subsection{The physical memory server} - -To be written. - -\begin{comment} - In fact, I already have some ideas. Here they are: - - The rootserver copies (or moves) the physical memory server - executable image to the right location in memory, according to its - respective ELF header. It also initializes the BSS section to zero. - - Then it follows the \texttt{exec()} protocol to startup the new - task. This should be done as transparently as possible. All pages - the rootserver provides because of page faults should be granted. - The rootserver waits for the physical memory server to contact the - rootserver thread. Then the following startup protocol is walked - through: - - \begin{enumerate} - \item The physical memory server requests all system memory from the - rootserver. The rootserver maps the memory from $\sigma_0$ and - grants it to the physical memory server. 
Alternatively, the - physical memory server might get the memory directly from - $\sigma_0$, but it should ask the rootserver for the amount and - location of memory to get. - - \item For each module that has not been used yet, the rootserver - requests a capability in the physical memory server that can be - used to map in pages from the range of memory that the module - occupies. These capabilities should implement the same pager - interface that mappable files implement. - - The idea is that these capabilities can be used in the - \texttt{exec()} protocol to start up the tasks for these modules. - If a module is not a task, the capability can be used to access - the module data by mapping it into the address space like a file. - The physical memory server can even swap out pages that back these - objects on memory pressure. - - So, the physical memory server is in fact a simple filesystem for - these initial tasks, usable only for mapping operations. - - \item The rootserver can then start up the other tasks in the module - list using the normal \texttt{exec()} protocol. - \end{enumerate} - - The result is that all tasks except for the rootserver can be - started like normal Hurd tasks, and can also be swapped out. -\end{comment} - - -\section{Inter-process communication (IPC)} -\label{ipc} - -The Hurd requires a capability system. Capabilities are used to proof -your identity to other servers (authentication), and access -server-side implemented objects like devices, files, directories, -terminals, and other things. The server can use a capability for -whatever it wants. Capabilities provide interfaces. Interfaces can -be invoked by sending messages to the capability. In L4, this means -that a message is sent to a thread in the server providing the -capability, with the identifier for the capability in the message. - -Capabilities are protected objects. Access to a capability needs to -be granted by the server. 
Once you have a capability, you can copy it -to other tasks (if the server permits it, which is usually the case). -In the Hurd, access to capabilities is always granted to a whole task, -not to individual threads. - -\begin{comment} - There is no reason for the server not to permit it, because the - holder of the capability could also just act as a proxy for the - intended receiver instead copying the capability to it. The - operation might fail anyway, for example because of resource - shortage, in particular if the server puts a quota on the number of - capabilities a user can hold. -\end{comment} - -Capabilities provide two essential services to the Hurd. They are -used to restrict access to a server function, and they are the -standard interface the components in the Hurd use to communicate with -each others. Thus, it is important that their implementation is fast -and secure. - -\begin{comment} - There are several ways to implement such a capability system. A - more traditional design would be a global, trusted capability server - that provides capabilities to all its users. The L4 redirector - could be used to reroute all client traffic automatically through - this server. This approach has several disadvantages: - - \begin{itemize} - \item It adds a lot of overhead to every single RPC, because all - traffic has to be routed through the capability server, which must - then perform the authentication on the server's behalf. - \item It would be difficult to copy a capability to another task. - Either the cap server would have to provide interfaces for clients - to do it, or it would be have to know the message format for every - interface and do it automatically. - \item It would be a single point of failure. If it had a bug and - crashed, the whole system would be affected. - \item Users could not avoid it, it would be enforced system code. - \item It is inflexible. It would be hard to replace or extend at - run-time. 
- \end{itemize} - - Another approach is taken by CORBA with IORs. IORs contain long - random numbers which allow the server to identify a user of an - object. This approach is not feasible for the following reasons: - - \begin{itemize} - \item Even good random numbers can be guessed. Long enough random - numbers can reduce the likelihood to arbitrary small numbers, - though (below the probability of a hardware failure). - \item Good random numbers are in short supply, and is slow to - generate. Good pseudo random is faster, but it is still difficult - to generate. The random number generator would become a critical - part of the operating system. - \item The random number had to be transfered in every single - message. Because it would have to be long, it would have a - significant negative impact on IPC performance. - \end{itemize} -\end{comment} - -The Hurd implements the capability system locally in each task. A -common default implementation will be shared by all programs. -However, a malicious untrusted program can do nothing to disturb the -communication of other tasks. A capability is identified in the -client task by the server thread and a local identifier (which can be -different from client to client). The server thread will receive -messages for the capabilities. The first argument in the message is -the capability identifier. Although every task can get different IDs -for the same capability, a well-behaving server will give the same ID -to a client which already has a capability and gets the same -capability from another client. So clients can compare capability IDs -from the server numerically to check if two capabilities are the same, -but only if one of the two IDs is received while the client already -had the other one. - -Because access to a capability must be restricted, the server needs to -be careful in only allowing registered and known users to access the -capability. 
For this, the server must be sure that it can determine -the sender of a message. In L4, this is easy on the surface: The -kernel provides the receiving thread with the sender's thread ID, -which also contains the task ID in the version field. However, the -server must also know for sure if this task is the same task that it -gave access to the capability. Comparing the task IDs numerically is -not good enough, the server must also somehow have knowledge or -influence on how task IDs are reused when tasks die and are created. - -The same is true for the client, of course, which trusts the server -and thus must be sure that it is not tricked into trusting on -unreliable data from an imposter, or sends sensitive data to it. - -\begin{comment} - The \texttt{task} server wants to reuse thread numbers because that - makes best use of kernel memory. Reusing task IDs, the version - field of a thread ID, is not so important, but there are only 14 - bits for the version field (and the lower six bits must not be all - zero). So a thread ID is bound to be reused eventually. - - Using the version field in a thread ID as a generation number is not - good enough, because it is so small. Even on 64-bit architectures, - where it is 32 bit long, it can eventually overflow. -\end{comment} - -The best way to prevent that a task can be tricked into talking to an -imposter is to have the \texttt{task} server notify the task if the -communication partner dies. The \texttt{task} server must guarantee -that the task ID is not reused until all tasks that got such a -notification acknowledge that it is processed, and thus no danger of -confusion exists anymore. - -The \texttt{task} server provides references to task IDs in form of -\emph{task info capabilities}. If a task has a task info capability -for another task, it prevents that this other task's task ID is reused -even if that task dies, and it also makes sure that task death -notifications are delivered in that case. 
- -\begin{comment} - Because only the \texttt{task} server can create and destroy tasks, - and assign task IDs, there is no need to hold such task info - capabilities for the \texttt{task} server, nor does the - \texttt{task} server need to hold task info capabilities for its - clients. This avoids the obvious bootstrap problem in providing - capabilities in the \texttt{task} server. This will even work if - the \texttt{task} server is not the real \texttt{task} server, but a - proxy task server (see section \ref{proxytaskserver} on page - \pageref{proxytaskserver}). -\end{comment} - -As task IDs are a global resource, care has to be taken that this -approach does not allow for a DoS-attack by exhausting the task ID -number space, see section \ref{taskinfocap} on page -\pageref{taskinfocap} for more details. - - -\subsection{Capabilities} - -This subsection contains implementation details about capabilities. - -A server will usually operate on objects, and not capabilities. In -the case of a filesystem, this could be file objects, for example. - -\begin{comment} - In the Hurd, filesystem servers have to keep different objects for - each time a file is looked up (or ``opened''), because some state, - for example authentication, open flags and record locks, are - associated not with the file directly, but with this instance of - opening the file. Such a state structure (``credential'') will also - contain a pointer and reference to the actual file node. For - simplicity, we will assume that the capability is associated with a - file node directly. -\end{comment} - -To provide access to the object to another task, the server creates a -capability, and associates it with the object (by setting a hook -variable in the capability). From this capability, the server can -either create send references to itself, or to other tasks. If the -server creates send references for itself, it can use the capability -just as it can use capabilities implemented by other servers. 
This -makes access to locally and remotely implemented capabilities -identical. If you write code to work on capabilities, it can be used -for remote objects as well as for local objects. - -If the server creates a send reference for another task (a client), a -new capability ID will be created for this task. This ID will only be -valid for this task, and should be returned to the client. - -The client itself will create a capability object from this capability -ID. The capability will also contain information about the server, -for example the server thread which should be used for sending -messages to the capability. - -If the client wants to send a message, it will send it to the provided -server thread, and use the capability ID it got from the server as the -first argument in the RPC. The server receives the message, and now -has to look up the capability ID in the list of capabilties for this -task. - -\begin{comment} - The server knows the task ID from the version field of the sender's - thread ID. It can look up the list of capabilities for this task in - a hash table. The capability ID can be an index into an array, so - the server only needs to perform a range check. This allows to - verify quickly that the user is allowed to access the object. - - This is not enough if several systems run in parallel on the same - host. Then the version ID for the threads in the other systems will - not be under the control of the Hurd's \texttt{task} server, and can - thus not be trusted. The server can still use the version field to - find out the task ID, which will be correct \emph{if the thread is - part of the same subsystem}. It also has to verify that the - thread belongs to this subsystem. Hopefully the subsystem will be - encoded in the thread ID. Otherwise, the \texttt{task} server has - to be consulted (and, assuming that thread numbers are not shared by - the different systems, the result can be cached). 
-\end{comment} - -The server reads out the capability associated with the capability ID, -and invokes the server stub according to the message ID field in the -message. - -After the message is processed, the server sends it reply to the -sender thread with a zero timeout. - -\begin{comment} - Servers must never block on sending messages to clients. Even a - small timeout can be used for DoS-attacks. The client can always - make sure that it receives the reply by using a combined send and - receive operation together with an infinite timeout. -\end{comment} - -The above scheme assumes that the server and the client already have -task info caps for the respective other task. This is the normal -case, because acquiring these task info caps is part of the protocol -that is used when a capability is copied from one task to another. - - -\subsubsection{Bootstrapping a client-server connection} -\label{ipcbootstrap} - -If the client and the server do not know about each other yet, then -they can bootstrap a connection without support from any other task -except the \texttt{task} server. The purpose of the initial handshake -is to give both participants a chance to acquire a task info cap for -the other participants task ID, so they can be sure that from there on -they will always talk to the same task as they talked to before. - -\paragraph{Preconditions} -The client knows the thread ID of the server thread that receives and -processes the bootstrap messages. Some other task might hold a task -info capability to the server the client wants to connect to. - -\begin{comment} - If no such other tasks exists, the protocol will still work. - However, the client might not get a connection to the server that - run at the time the client started the protocol, but rather to the - server that run at the time the client acquired the task info cap - for the server's task ID (after step 1 below). 
- - This is similar to how sending signals works in Unix: Technically, - at the time you write \texttt{kill 203}, and press enter, you do not - know if the process with the PID 203 you thought of will receive the - signal, or some other process that got the PID in the time between - you getting the information about the PID and writing the - \texttt{kill}-command. -\end{comment} - -FIXME: Here should be the pseudo code for the protocol. For now, you -have to take it out of the long version. - -\begin{enumerate} - -\item The client acquires a task info capability for the server's task - ID, either directly from the \texttt{task} server, or from another - task in a capability copy. From that point on, the client can be - sure to always talk to the same task when talking to the server. - - Of course, if the client already has a task info cap for the server - it does not need to do anything in this step. - -\begin{comment} - As explained above, if the client does not have any other task - holding the task info cap already, it has no secure information - about what this task is for which it got a task info cap. -\end{comment} - -\item The client sends a message to the server, requesting the initial - handshake. - -\item The server receives the message, and acquires a task info cap - for the client task (directly from the \texttt{task} server). - - Of course, if the server already has a task info cap for the client - it does not need to do anything in this step. - -\begin{comment} - At this point, the server knows that future messages from this task - will come from the same task as it got the task info cap for. - However, it does not know that this is the same task that sent the - initial handshake request in step 2 above. This shows that there is - no sense in verifying the task ID or perform any other - authentication before acquiring the task info cap. -\end{comment} - -\item The server replies to the initial handshake request with an - empty reply message. 
- -\begin{comment} - Because the reply now can go to a different task than the request - came from, sending the reply might fail. It might also succeed and - be accepted by the task that replaced the requestor. Or it might - succeed normally. The important thing is that it does not matter to - the server at all. It would have provided the same ``service'' to - the ``imposter'' of the client, if he had bothered to do the - request. As no authentication is done yet, there is no point for - the server to bother. - - This means however, that the server needs to be careful in not - consuming too many resources for this service. However, this is - easy to achieve. Only one task info cap per client task will ever - be held in the server. The server can either keep it around until - the task dies (and a task death notification is received), or it can - clean it up after some timeout if the client does not follow up and - do some real authentication. -\end{comment} - -\item The client receives the reply message to its initial handshake - request. - -\item The client sends a request to create its initial capability. - How this request looks depends on the type of the server and the - initial capabilities it provides. Here are some examples: - - \begin{itemize} - \item A filesystem might provide an unauthenticated root directory - object in return of the underlying node capability, which is - provided by the parent filesystem and proves to the filesystem - that the user was allowed to look up the root node of this - filesystem (see section \ref{xfslookup} on page - \pageref{xfslookup}). - - \begin{comment} - In this example, the parent filesystem will either provide the - task info cap for the child filesystem to the user, or it will - hold the task info cap while the user is creating their own - (which the user has to verify by repeating the lookup, though). - Again, see section \ref{xfslookup} on page \pageref{xfslookup}. 
-
- The unauthenticated root directory object will then have to be
- authenticated using the normal reauthentication mechanism (see
- section \ref{auth} on page \pageref{auth}). This can also be combined
- in a single RPC.
- \end{comment}
-
- \item Every process acts as a server that implements the signal
- capability for this process. Tasks who want to send a signal to
- another task can perform the above handshake, and then provide
- some type of authentication capability that indicates that they
- are allowed to send a signal. Different authentication
- capabilities can be accepted by the signalled task for different
- types of signals.
-
- \begin{comment}
- The Hurd used to store the signal capability in the proc server,
- where authorized tasks could look it up. This is no longer
- possible because a server can not accept capabilities
- implemented by untrusted tasks, see below.
- \end{comment}
- \end{itemize}
-
-\item The server replies with whatever capability the client
- requested, provided that the client could provide the necessary
- authentication capabilities, if any.
-
- \begin{comment}
- It is not required that the server performs any authentication at
- all, but it is recommended, and all Hurd servers will do so.
-
- In particular, the server should normally only allow access from
- tasks running in the same system, if running multiple systems on
- the same host is possible.
- \end{comment}
-\end{enumerate}
-
-\paragraph{Result}
-The client has a task info capability for the server and an
-authenticated capability. The server has a task info capability for
-the client and has seen some sort of authentication for the capability it
-gave to the client.
-
-\begin{comment}
- If you think that the above protocol is complex, you have seen
- nothing yet! Read on.
-\end{comment} - - -\subsubsection{Returning a capability from a server to a client} - -Before we go on to the more complex case of copying a capability from -one client to another, let us point out that once a client has a -capability from a server, it is easy for the server to return more -capabilities it implements to the client. - -The server just needs to create the capability, acquire a capability -ID in the client's cap ID space, and return the information in the -reply RPC. - -FIXME: Here should be the pseudo code for the protocol. For now, you -have to take it out of the long version. - -\begin{comment} - The main point of this section is to point out that only one task - info capability is required to protect all capabilities provided to - a single task. The protocols described here always assume that no - task info caps are held by anyone (except those mentioned in the - preconditions). In reality, sometimes the required task info caps - will already be held. -\end{comment} - - -\subsubsection{Copying a capability from one client to another task} - -The most complex operation in managing capabilities is to copy or move -a capability from the client to another task, which subsequently -becomes a client of the server providing the capability. The -difficulty here lies in the fact that the protocol should be fast, but -also robust and secure. If any of the participants dies unexpectedly, -or any of the untrusted participants is malicious, the others should -not be harmed. - -\paragraph{Preconditions} -The client $C$ has a capability from server $S$ (this implies that $C$ -has a task info cap for $S$ and $S$ has a task info cap for $C$). It -wants to copy the capability to the destination task $D$. For this, -it will have to make RPCs to $D$, so $C$ has also a capability from -$D$ (this implies that $C$ has a task info cap for $D$ and $D$ has a -task info cap for $C$). Of course, the client $C$ trusts its servers -$S$ and $D$. 
$D$ might trust $S$ or not, and thus accept or reject -the capability that $C$ wants to give to $D$. $S$ does not trust -either $C$ or $D$. - -The \texttt{task} server is also involved, because it provides the -task info capabilities. Everyone trusts the \texttt{task} server they -use. This does not need to be the same one for every participant. - -FIXME: Here should be the pseudo code for the protocol. For now, you -have to take it out of the long version. - -\begin{enumerate} -\item The client invokes the \verb/cap_ref_cont_create/ RPC on the - capability, providing the task ID of the intended receiver $D$ of - the capability. - -\item The server receives the \verb/cap_ref_cont_create/ RPC from the - client. It requests a task info cap for $D$ from its trusted task - server, under the constraint that $C$ is still living. - - \begin{comment} - A task can provide a constraint when creating a task info cap in - the \texttt{task} server. The constraint is a task ID. The task - server will only create the task info cap and return it if the - task with the constraint task ID is not destroyed. This allows - for a task requesting a task info capability to make sure that - another task, which also holds this task info cap, is not - destroyed. This is important, because if a task is destroyed, all - the task info caps it held are released. - - In this case, the server relies on the client to hold a task info - cap for $D$ until it established its own. See below for what can - go wrong if the server would not provide a constraint and both, - the client and the destination task would die unexpectedly. - \end{comment} - - Now that the server established its own task info cap for $D$, it - creates a reference container for $D$, that has the following - properties: - - \begin{itemize} - \item The reference container has a single new reference for the - capability. - - \item The reference container has an ID that is unique among all - reference container IDs for the client $C$. 
-
- \item The reference container is associated with the client $C$. If
- $C$ dies, and the server processes the task death notification for
- it, the server will destroy the reference container and release
- the capability reference it has (if any). All resources
- associated with the reference container will be released. If this
- reference container was the only reason for $S$ to hold the task
- info cap for $D$, the server will also release the task info cap
- for $D$.
-
- \item The reference container is also associated with the
- destination task $D$. If $D$ dies, and the server processes the
- task death notification for it, the server will release the
- capability reference that is in the reference container (if any).
- It will not destroy the part of the container that is associated
- with $C$.
- \end{itemize}
-
- The server returns the reference container ID $R$ to the client.
-
-\item The client receives the reference container ID $R$.
-
- \begin{comment}
- If several capabilities have to be copied in one message, the
- above steps need to be repeated for each capability. With
- appropriate interfaces, capabilities could be collected so that
- only one call per server has to be made. We are assuming here
- that only one capability is copied.
- \end{comment}
-
-\item The client sends the server thread ID $T$ and the reference
- container ID $R$ to the destination task $D$.
-
-\item The destination task $D$ receives the server thread ID $T$ and
- the reference container ID $R$ from $C$.
-
- It now inspects the server thread ID $T$, and in particular the task
- ID component of it. $D$ has to make the decision if it trusts this
- task to be a server for it, or if it does not trust this task.
-
- If $D$ trusts $C$, it might decide to always trust $T$, too,
- regardless of what task contains $T$.
-
- If $D$ does not trust $C$, it might be more picky about the task
- that contains $T$.
This is because $D$ will have to become a client - of $T$, so it will trust it. For example, it will block on messages - it sends to $T$. - - \begin{comment} - If $D$ is a server, it will usually only accept capabilities from - its client that are provided by specific other servers it trusts. - This can be the authentication server, for example (see section - \ref{auth} on page \pageref{auth}). - - Usually, the type of capability that $D$ wants to accept from $C$ - is then further restricted, and only one possible trusted server - implements that type of capabilities. Thus, $D$ can simply - compare the task ID of $T$ with the task ID of its trusted server - (authentication server, ...) to make the decision if it wants to - accept the capability or not. - \end{comment} - - If $D$ does not trust $T$, it replies to $C$ (probably with an error - value indicating why the capability was not accepted). In that - case, jump to step \ref{copycapout}. - - Otherwise, it requests a task info cap for $S$ from its trusted task - server, under the constraint that $C$ is still living. - - Then $D$ sends a \verb/cap_ref_cont_accept/ RPC to the server $S$, - providing the task ID of the client $C$ and the reference container - ID $R$. - -\begin{comment} - \verb/cap_ref_cont_accept/ is one of the few interfaces that is not - sent to a (real) capability, of course. Nevertheless, it is part of - the capability object interface, hence the name. You can think of - it as a static member in the capability class, that does not require - an instance of the class. -\end{comment} - -\item The server receives the \verb/cap_ref_cont_accept/ RPC from the - destination task $D$. It verifies that a reference container exists - with the ID $R$, that is associated with $D$ and $C$. - - \begin{comment} - The server will store the reference container in data structures - associated with $C$, under an ID that is unique but local to $C$. 
- So $D$ needs to provide both information, the task ID and the
- reference container ID of $C$.
- \end{comment}
-
- If that is the case, it takes the reference from the reference
- container, and creates a capability ID for $D$ from it. The
- capability ID for $D$ is returned in the reply message.
-
- From that moment on, the reference container is deassociated from
- $D$. It is still associated with $C$, but it does not contain any
- reference for the capability.
-
- \begin{comment}
- It is not deassociated from $C$ and removed completely, so that
- its ID $R$ (or at least the part of it that is used for $C$) is
- not reused. $C$ must explicitly destroy the reference container
- anyway because $D$ might die unexpectedly or return an error that
- gives no indication if it accepted the reference or not.
- \end{comment}
-
-\item The destination task $D$ receives the capability ID and enters
- it into its capability system. It sends a reply message to $C$.
-
- \begin{comment}
- If the only purpose of the RPC was to copy the capability, the
- reply message can be empty. Usually, capabilities will be
- transferred as part of a larger operation, though, and more work
- will be done by $D$ before returning to $C$.
- \end{comment}
-
-\item \label{copycapout} The client $C$ receives the reply from $D$.
- Regardless of whether it indicated failure or success, it will now send
- the \verb/cap_ref_cont_destroy/ message to the server $S$, providing
- the reference container $R$.
-
- \begin{comment}
- This message can be a simple message. It does not require a reply
- from the server.
- \end{comment}
-
-\item The server receives the \verb/cap_ref_cont_destroy/ message and
- removes the reference container $R$. The reference container is
- deassociated from $C$ and $D$. If this was the only reason that $S$
- held a task info cap for $D$, this task info cap is also released.
- - \begin{comment} - Because the reference container can not be deassociated from $C$ - by any other means than this interface, the client does not need - to provide $D$. $R$ can not be reused without the client $C$ - having it destroyed first. This is different from the - \verb/cap_ref_cont_accept/ call made by $D$, see above. - \end{comment} - -\end{enumerate} - -\paragraph{Result} -For the client $C$, nothing has changed. The destination task $D$ -either did not accept the capability, and nothing has changed for it, -and also not for the server $S$. Or $D$ accepted the capability, and -it now has a task info cap for $S$ and a reference to the capability -provided by $S$. In this case, the server $S$ has a task info cap for -$D$ and provides a capability ID for this task. - -The above protocol is for copying a capability from $C$ to $D$. If -the goal was to move the capability, then $C$ can now release its -reference to it. - -\begin{comment} - Originally we considered to move capabilities by default, and - require the client to acquire an additional reference if it wanted - to copy it instead. However, it turned out that for the - implementation, copying is easier to handle. One reason is that the - client usually will use local reference counting for the - capabilities it holds, and with local reference counting, one - server-side reference is shared by many local references. In that - case, you would need to acquire a new server-side reference even if - you want to move the capability. The other reason is cancellation. - If an RPC is cancelled, and you want to back out of it, you need to - restore the original situation. And that is easier if you do not - change the original situation in the first place until the natural - ``point of no return''. -\end{comment} - -The above protocol quite obviously achieves the result as described in -the above concluding paragraph. However, many other, and often -simpler, protocols would also do that. 
The other protocols we looked
-at are not secure or robust though, or require more operations. To
-date we think that the above is the shortest (in particular in number
-of IPC operations) protocol that is also secure and robust (and if it
-is not we think it can be fixed to be secure and robust with minimal
-changes). We have no proof for its correctness. Our confidence comes
-from the scrutiny we applied to it. If you find a problem with the
-above protocol, or if you can prove various aspects of it, we would
-like to hear about it.
-
-To understand why the protocol is laid out as it is, and why it is a
-secure and robust protocol, one has to understand what could possibly
-go wrong and why it does not cause any problems for any participant if
-it follows its part of the protocol (independent of what the other
-participants do). In the following paragraphs, various scenarios are
-suggested where things do not go as expected in the above protocol.
-This is probably not a complete list, but it should come close to it.
-If you find any other problematic scenario, again, let us know.
-
-\begin{comment}
- Although some comments like this appear in the protocol description
- above, many comments have been spared for the following analysis of
- potential problems. Read the analysis carefully, as it provides
- important information about how, and more importantly, why it works.
-\end{comment}
-
-\paragraph{The server $S$ dies}
-What happens if the server $S$ dies unexpectedly sometime throughout
-the protocol?
-
-\begin{comment}
- At any time a task dies, the task info caps it held are released.
- Also, task death notifications are sent to any task that holds task
- info caps to the now dead task. The task death notifications will
- be processed asynchronously, so they might be processed immediately,
- or at any later time, even much later after the task died!
So one - important thing to keep in mind is that the release of task info - caps a task held, and other tasks noticing the task death, are - always some time apart. -\end{comment} - -Because the client $C$ holds a task info cap for $S$ no imposter can -get the task ID of $S$. $C$ and $D$ will get errors when trying to -send messages to $S$. - -\begin{comment} - You might now wonder what happens if $C$ also dies, or if $C$ is - malicious and does not hold the task info cap. You can use this as - an exercise, and try to find the answer on your own. The answers - are below. -\end{comment} - -Eventually, $C$ (and $D$ if it already got the task info cap for $S$) -will process the task death notification and clean up their state. - -\paragraph{The client $C$ dies} -The server $S$ and the destination task $D$ hold a task info cap for -$C$, so no imposter can get its task ID. $S$ and $D$ will get errors -when trying to send messages to $C$. Depending on when $C$ dies, the -capability might be copied successfully or not at all. - -Eventually, $S$ and $D$ will process the task death notification and -release all resources associated with $C$. If the reference was not -yet copied, this will include the reference container associated with -$C$, if any. If the reference was already copied, this will only -include the empty reference container, if any. - -\begin{comment} - Of course, the participants need to use internal locking to protect - the integrity of their internal data structures. The above protocol - does not show where locks are required. In the few cases where some - actions must be performed atomically, a wording is used that - suggests that. -\end{comment} - -\paragraph{The destination task $D$ dies} - -The client $C$ holds a task info cap for $D$ over the whole operation, -so no imposter can get its task ID. 
Depending on when $D$ dies, it
-has either not yet accepted the capability, then $C$ will clean up by
-destroying the reference container, or it has, and then $S$ will clean
-up its state when it processes the task death notification for $D$.
-
-\paragraph{The client $C$ and the destination task $D$ die}
-
-This scenario is the reason why the server acquires its own task info
-cap for $D$ so early, and why it must do that under the constraint
-that $C$ still lives. If $C$ and $D$ die before the server created
-the reference container, then either no request was made, or creating
-the task info cap for $D$ fails because of the constraint. If $C$ and
-$D$ die afterwards, then no imposter can get the task ID of $D$ and
-try to get at the reference in the container, because the server has
-its own task info cap for $D$.
-
-\begin{comment}
- This problem was identified very late in the development of this
- protocol. We just did not think of both clients dying at the same
- time! In an earlier version of the protocol, the server would
- acquire its task info cap when $D$ accepts its reference. This is
- too late: If $C$ and $D$ die just before that, an imposter with
- $D$'s task ID can try to get the reference in the container before
- the server processes the task death notification for $C$ and
- destroys it.
-\end{comment}
-
-Eventually, the server will receive and process the task death
-notifications. If it processes the task death notification for $C$
-first, it will destroy the whole container immediately, including the
-reference, if any. If it processes the task death notification for
-$D$ first, it will destroy the reference, and leave behind the empty
-container associated with $C$, until the other task death notification
-is processed. Either way no imposter can get at the capability.
-
-Of course, if the capability was already copied at the time $C$ and
-$D$ die, the server will just do the normal cleanup.
-
-\paragraph{The client $C$ and the server $S$ die}
-
-This scenario does not cause any problems, because on the one hand,
-the destination task $D$ holds a task info cap for $C$, and it
-acquires its own task info cap for $S$. Although it does this quite
-late in the protocol, it does so under the constraint that $C$ still
-lives, which has a task info cap for $S$ for the whole time (until it
-dies). It also gets the task info cap for $S$ before sending any
-message to it. An imposter with the task ID of $S$, which it was
-possible to get because $C$ died early, would not receive any message
-from $D$ because $D$ uses $C$ as its constraint in acquiring the task
-info cap for $S$.
-
-\paragraph{The destination task $D$ and the server $S$ die}
-
-As $C$ holds task info caps for $S$ and $D$, there is nothing that can
-go wrong here. Eventually, the task death notifications are
-processed, but the task info caps are not released until the protocol
-is completed or aborted because of errors.
-
-\paragraph{The client $C$, the destination task $D$ and the server $S$ die}
-
-Before the last one of these dies, you are in one of the scenarios
-which already have been covered. After the last one dies, there is
-nothing to take care of anymore.
-
-\begin{comment}
- In this case your problem is probably not the capability copy
- protocol, but the stability of your software! Go fix some bugs.
-\end{comment}
-
-So far we have covered the scenarios where one or more of the
-participating tasks die
-unexpectedly. They could also die purposefully. Other things that
-tasks can try to do purposefully to break the protocol are presented
-in the following paragraphs.
-
-\begin{comment}
- A task that tries to harm other tasks by not following a protocol
- and not behaving as other tasks might expect is malicious. Beside
- security concerns, this is also an issue of robustness, because
- malicious behaviour can also be triggered by bugs rather than bad
- intentions.
-
- It is difficult to protect against malicious behaviour by trusted
- components, like the server $S$, which is trusted by both $C$ and
- $D$. If a trusted component is compromised or buggy, ill
- consequences for software that trusts it must be expected. Thus, no
- analysis is provided for scenarios involving a malicious or buggy
- server $S$.
-\end{comment}
-
-\paragraph{The client $C$ is malicious}
-
-If the client $C$ wants to break the protocol, it has numerous
-possibilities to do so. The first thing it can do is to provide a
-wrong destination task ID when creating the container. But in this
-case, the server will return an error to $D$ when it tries to accept
-it, and this will give $D$ a chance to notice the problem and clean
-up. This also would allow for some other task to receive the
-container, but the client can give the capability to any other task it
-wants to anyway, so this is not a problem.
-
-\begin{comment}
- If a malicious behaviour results in an outcome that can also be
- achieved following the normal protocol with different parameters,
- then this is not a problem at all.
-\end{comment}
-
-The client could also try to create a reference container for $D$ and
-then not tell $D$ about it. However, a reference container should not
-consume a lot of resources in the server, and all such resources
-should be attributed to $C$. When $C$ dies eventually, the server
-will clean up any such pending containers when the task death
-notification is processed.
-
-The same argument holds when $C$ leaves out the call to
-\verb/cap_ref_cont_destroy/.
-
-The client $C$ could also provide wrong information to $D$. It could
-supply a wrong server thread ID $T$. It could supply a wrong
-reference container ID $R$. If $D$ does not trust $C$ and expects a
-capability implemented by some specific trusted server, it will verify
-the thread ID numerically and reject it if it does not match.
The
-reference container ID will be verified by the server, and it will
-only be accepted if the reference container was created by the client
-task $C$. Thus, the only wrong reference container IDs that the
-client $C$ could use to not provoke an error message from the server
-(which would then lead $D$ to abort the operation) would be a reference
-container that it created itself in the first place. However, $C$
-already is free to send $D$ any reference container it created.
-
-\begin{comment}
- Again $C$ can not achieve anything it could not achieve by just
- following the protocol as well. If $C$ tries to use the same
- reference container with several RPCs in $D$, one of them would
- succeed and the others would fail, hurting only $C$.
-
- If $D$ does trust $C$, then it can not protect against malicious
- behaviour by $C$.
-\end{comment}
-
-To summarize the result so far: $C$ can provide wrong data in the
-operations it does, but it can not achieve anything this way that it
-could not achieve by just following the protocol. In most cases the
-operation would just fail. If it leaves out some operations, trying
-to provoke resource leaks in the server, it will only hurt itself (as
-the reference container is strictly associated with $C$ until the
-reference is accepted by $D$).
-
-\begin{comment}
- For optimum performance, the server should be able to keep the
- information about the capabilities and reference containers a client
- holds on memory that is allocated on the client's behalf.
-
- It might also use some type of quota system.
-\end{comment}
-
-Another attack that $C$ can attempt is to deny a service that $S$ and
-$D$ are expecting of it. Beside not doing one or more of the RPCs,
-this is in particular holding the task info caps for the time span as
-described in the protocol. Of course, this can only be potentially
-dangerous in combination with a task death.
If $C$ does not hold the -server task info capability, then an imposter of $S$ could trick $D$ -into using the imposter as the server. However, this is only possible -if $D$ already trusts $C$. Otherwise it would only allow servers that -it already trusts, and it would always hold task info caps to such -trusted servers when making the decision that it trusts them. -However, if $D$ trusts $C$, it can not protect against $C$ being -malicious. - -\begin{comment} - If $D$ does not trust $C$, it should only ever compare the task ID - of the server thread against trusted servers it has a task info cap - for. It must not rely on $C$ doing that for $D$. - - However, if $D$ does trust $C$, it can rely on $C$ holding the - server task info cap until it got its own. Thus, the task ID of $C$ - can be used as the constraint when acquiring the task info cap in - the protocol. -\end{comment} - -If $C$ does not hold the task info cap of $D$, and $D$ dies before the -server acquires its task info cap for $D$, it might get a task info -cap for an imposter of $D$. But if the client wants to achieve that, -it could just follow the protocol with the imposter as the destination -task. - -\paragraph{The destination task $D$ is malicious} - -The destination task has not as many possibilities as $C$ to attack -the protocol. This is because it is trusted by $C$. So the only -participant that $D$ can try to attack is the server $S$. But the -server $S$ does not rely on any action by $D$. $D$ does not hold any -task info caps for $S$. The only operation it does is an RPC to $S$ -accepting the capability, and if it omits that it will just not get -the capability (the reference will be cleaned up by $C$ or by the -server when $C$ dies). - -The only thing that $D$ could try is to provide false information in -the \verb/cap_ref_cont_accept/ RPC. The information in that RPC is -the task ID of the client $C$ and the reference container ID $R$. 
The
-server will verify that the client $C$ has previously created a
-reference container with the ID $R$ that is destined for $D$. So $D$
-will only be able to accept references that it is granted access to.
-So it can not achieve anything that it could not achieve by following
-the protocol (possibly the protocol with another client). If $D$
-accepts capabilities from other transactions outside of the protocol,
-it can only cause other transactions in its own task to fail.
-
-\begin{comment}
- If you can do something wrong and harm yourself that way, then this
- is called ``shooting yourself in your foot''.
-
- The destination task $D$ is welcome to shoot itself in its foot.
-\end{comment}
-
-\paragraph{The client $C$ and the destination task $D$ are malicious}
-
-The final question we want to raise is what can happen if the client
-$C$ and the destination task $D$ are malicious. Can $C$ and $D$
-cooperate in attacking $S$ in a way that $C$ or $D$ alone could not?
-
-In the above analysis, there is no place where we assume any specific
-behaviour of $D$ to help $S$ in preventing an attack on $S$. There is
-only one place where we make an assumption for $C$ in the analysis of
-a malicious $D$. If $D$ does not accept a reference container, we
-said that $C$ would clean it up by calling
-\verb/cap_ref_cont_destroy/. So we have to look at what would happen
-if $C$ were not to do that.
-
-Luckily, we covered this case already. It is identical to the case
-where $C$ does not even tell $D$ about the reference container and
-just does nothing. In this case, as said before, the server will
-eventually release the reference container when $C$ dies. Before
-that, it only occupies resources in the server that are associated
-with $C$.
-
-This analysis is sketchy in parts, but it covers a broad range of
-possible attacks. For example, all possible and relevant combinations
-of task deaths and malicious tasks are covered.
Although by no means -complete, it can give us some confidence about the rightness of the -protocol. It also provides a good set of test cases that you can test -your own protocols, and improvements to the above protocol against. - - -\subsubsection{The trust rule} - -The protocol to copy a capability from one client to another task has -a dramatic consequence on the design of the Hurd interfaces. - -Because the receiver of the capability must make blocking calls to the -server providing the capability, the receiver of the capability -\emph{must} trust the server providing the capability. - -This means also: If the receiver of a capability does not trust the -server providing the capability, it \emph{must not} accept it. - -The consequence is that normally, servers can not accept capabilities -from clients, unless they are provided by a specific trusted server. -This can be the \texttt{task} or \texttt{auth} server for example. - -This rule is even true if the receiver does not actually want to use -the capability for anything. Just accepting the capability requires -trusting the server providing it already. - -In the Hurd on Mach, ports (which are analogous to capabilities in -this context) can be passed around freely. There is no security risk -in accepting a port from any source, because the kernel implements -them as protected objects. Using a port by sending blocking messages -to it requires trust, but simply storing the port on the server side -does not. - -This is different in the Hurd on L4: A server must not accept -capabilities unless it trusts the server providing them. Because -capabilities are used for many different purposes (remote objects, -authentication, identification), one has to be very careful in -designing the interfaces. The Hurd interfaces on Mach use ports in a -way that is not possible on L4. Such interfaces need to be -redesigned. 
-
-Often, redesigning such an interface also fixes some other security
-problems that exist within the Hurd on L4, in particular DoS
-attacks. A good part of this paper is about redesigning the Hurd to
-avoid storing untrusted capabilities on the server side.
-
-\begin{comment}
- Examples are:
-
- \begin{itemize}
- \item The new authentication protocol, which eliminates the need for
- a rendezvous port and is not only faster, but also does not
- require the server to block on the client anymore (see section
- \ref{auth} on page \pageref{auth}).
-
- \item The signal handling, which does not require the \texttt{proc}
- server to hold the signal port for every task anymore (see section
- \ref{signals} on page \pageref{signals}).
-
- \item The new exec protocol, which eliminates the need to pass all
- capabilities that need to be transferred to the new executable from
- the old program to the filesystem server, and then to the
- \texttt{exec} server (see section \ref{exec} on page
- \pageref{exec}).
-
- \item The new way to implement Unix Domain Sockets, which don't
- require a trusted system server, so that descriptor passing (which
- is really capability passing) can work (see section
- \ref{unixdomainsockets} on page \pageref{unixdomainsockets}).
-
- \item The way parent and child filesystem are linked to each other,
- in other words: how mounting a filesystem works (see section
- \ref{xfslookup} on page \pageref{xfslookup}).
-
- \item The replacement for the \verb/file_reparent()/ RPC (see
- section \ref{reparenting} on page \pageref{reparenting}).
- \end{itemize}
-\end{comment}
-
-\subsection{Synchronous IPC}
-
-The Hurd only needs synchronous IPC. Asynchronous IPC is usually not
-required. An exception is notifications (see below).
-
-There are possibly some places in the Hurd source code where
-asynchronous IPC is assumed. These must be replaced with different
-strategies. One example is the implementation of select() in the GNU
-C library.
- -\begin{comment} - A naive implementation would use one thread per capability to select - on. A better one would combine all capabilities implemented by the - same server in one array and use one thread per server. - - A more complex scheme might let the server process select() calls - asynchronously and report the result back via notifications. -\end{comment} - -In other cases the Hurd receives the reply asynchronously from sending -the message. This works fine in Mach, because send-once rights are -used as reply ports and Mach guarantees to deliver the reply message, -ignoring the kernel queue limit. In L4, no messages are queued and -such places need to be rewritten in a different way (for example using -extra threads). - -\begin{comment} - What happens if a client does not go into the receive phase after a - send, but instead does another send, and another one, quickly many - sends, as fast as possible? A carelessly written server might - create worker threads for each request. Instead, the server should - probably reject to accept a request from a client thread that - already has a pending request, so the number of worker threads is - limited to the number of client threads. - - This also makes interrupting an RPC operation easier (the client - thread ID can be used to identify the request to interrupt). -\end{comment} - - -\subsection{Notifications} - -Notifications to untrusted tasks happen frequently. One case is -object death notifications, in particular task death notifications. -Other cases might be select() or notifications of changes to the -filesystem. - -The console uses notifications to broadcast change events to the -console content, but it also uses shared memory to broadcast the -actual data, so not all notifications need to be received for -functional operation. Still, at least one notification is queued by -Mach, and this is sufficient for the console to wakeup whenever -changes happened, even if the changes can not be processed -immediately. 
- -From the server's point of view, notifications are simply messages with -a send and xfer timeout of 0 and without a receive phase. - -For the client, however, there is only one way to ensure that it will -receive the notification: It must have the receiving thread in the -receive phase of an IPC. While this thread is processing the -notification (even if it is only delegating it), it might be preempted -and another (or the same) server might try to send a second -notification. - -\begin{comment} - It is an open challenge how the client can ensure that it either - receives the notification or at least knows that it missed it, while - the server remains safe from potential DoS attacks. The usual - strategy, to give receivers of notifications a higher scheduling - priority than the sender, is not usable in a system with untrusted - receivers (like the Hurd). The best strategy determined so far is - to have the servers retry to send the notification several times - with small delays in between. This can increase the chance that a - client is able to receive the notification. However, there is still - the question what a server can do if the client is not ready. - - An alternative might be a global trusted notification server that - runs at a higher scheduling priority and records which servers have - notifications for which clients, and that can be used by clients to - be notified of pending notifications. Then the clients can poll the - notifications from the servers. -\end{comment} - - -\section{Threads and Tasks} - -The \texttt{task} server will provide the ability to create tasks and -threads, and to destroy them. - -\begin{comment} - In L4, only threads in the privileged address space (the rootserver) - are allowed to manipulate threads and address spaces (using the - \textsc{ThreadControl} and \textsc{SpaceControl} system calls). 
The - \texttt{task} server will use the system call wrappers provided by - the rootserver, see section \ref{rootserver} on page - \pageref{rootserver}. -\end{comment} - -The \texttt{task} server provides three different capability types. - -\paragraph{Task control capabilities} -If a new task is created, it is always associated with a task control -capability. The task control capability can be used to create and -destroy threads in the task, and destroy the task itself. So the task -control capability gives the owner of a task control over it. Task -control capabilities have the side effect that the task ID of this -task is not reused, as long as the task control capability is not -released. Thus, having a task control capability affects the global -namespace of task IDs. If a task is destroyed, task death -notifications are sent to holders of task control capabilities for -that task. - -\begin{comment} - A task is also implicitly destroyed when the last task control - capability reference is released. -\end{comment} - -\paragraph{Task info capabilities} -\label{taskinfocap} -Any task can create task info capabilities for other tasks. Such task -info capabilities are used mainly in the IPC system (see section -\ref{ipc} on page \pageref{ipc}). Task info capabilities have the -side effect that the task ID of this task is not reused, as long as -the task info capability is not released. Thus, having a task info -capability affects the global namespace of task IDs. If a task is -destroyed, task death notifications are sent to holders of task info -capabilities for that task. - -\begin{comment} - Because of that, holding task info capabilities must be restricted - somehow. Several strategies can be taken: - - \begin{itemize} - \item Task death notifications can be monitored. If there is no - acknowledgement within a certain time period, the \texttt{task} - server could be allowed to reuse the task ID anyway. 
This is not - a good strategy because it can considerably weaken the security of - the system (capabilities might be leaked to tasks which reuse such - a task ID reclaimed by force). - \item The proc server can show dead task IDs which are not released - yet, in analogy to the zombie processes in Unix. It can also make - available the list of tasks which prevent reusing the task ID, to - allow users or the system administrator to clean up manually. - \item Quotas can be used to punish users who do not acknowledge - task death in a timely manner. For example, if the number of tasks the user - is allowed to create is restricted, the task info caps that the - user holds for dead tasks could be counted toward that limit. - \item Any task could be restricted to as many task ID references as - there are live tasks in the system, plus some slack. That would - prevent the task from creating new task info caps if it does not - release old ones from dead tasks. The slack would be provided to - not unnecessarily slow down a task that processes task death - notifications asynchronously to making connections with new tasks. - \end{itemize} - - In particular the last two approaches should prove to be effective - in providing an incentive for tasks to release task info caps they - do not need anymore. -\end{comment} - -\paragraph{Task manager capability} -A task is a relatively simple object, compared to a full blown POSIX -process, for example. As the \texttt{task} server is enforced system -code, the Hurd does not impose POSIX process semantics in the task -server. Instead, POSIX process semantics are implemented in a -different server, the proc server (see also section \ref{proc} on page -\pageref{proc}). To allow the \texttt{proc} server to do its work, it -needs to be able to get the task control capability for any task, and -gather other statistics about them. Furthermore, there must be the -possibility to install quota mechanisms and other monitoring systems. 
-The \texttt{task} server provides a task manager capability, that -allows the holder of that capability to control the behaviour of the -\texttt{task} server and get access to the information and objects it -provides. - -\begin{comment} - For example, the task manager capability could be used to install a - policy capability that is used by the \texttt{task} server to make - upcalls to a policy server whenever a new task or thread is created. - The policy server could then indicate if the creation of the task or - thread is allowed by that user. For this to work, the \texttt{task} - server itself does not need to know about the concept of a user, or - the policies that the policy server implements. - - Now that I am writing this, I realize that without any further - support by the \texttt{task} server, the policy server would be - restricted to the task and thread ID of the caller (or rather the - task control capability used) to make its decision. A more - capability oriented approach would then not be possible. This - requires more thought. - - The whole task manager interface is not written yet. -\end{comment} - -When creating a new task, the \texttt{task} server allocates a new -task ID for it. The task ID will be used as the version field of the -thread ID of all threads created in the task. This allows the -recipient of a message to verify the sender's task ID efficiently and -easily. - -\begin{comment} - The version field is 14 bit on 32-bit architectures, and 32 bit on - 64 bit architectures. Because the lower six bits must not be all - zero (to make global thread IDs different from local thread IDs), - the number of available task IDs is $2^{14} - 2^6$ resp. $2^{32} - - 2^6$. - - If several systems are running in parallel on the same host, they - might share thread IDs by encoding the system ID in the upper bits - of the thread number. 
-\end{comment} - -Task IDs will be reused only if there are no task control or info -capabilities for that task ID held by any task in the system. To -support bootstrapping an IPC connection (see section -\ref{ipcbootstrap} on page \pageref{ipcbootstrap}), the \texttt{task} -server will delay reusing a task ID as long as possible. - -\begin{comment} - This is similar to how PIDs are generated in Unix. Although it is - attempted to keep PIDs small for ease of use, PIDs are not reused - immediately. Instead, the PID is incremented up to a certain - maximum number, and only then smaller PID values are reused again. - - As task IDs are not a user interface, there is no need to keep them - small. The whole available range can be used to delay reusing a - task ID as long as possible. -\end{comment} - -When creating a new task, the \texttt{task} server also has to create -the initial thread. This thread will be inactive. Once the creation -and activation of the initial thread has been requested by the user, -it will be activated. When the user requests to destroy the last -thread in a task, the \texttt{task} server makes that thread inactive -again. - -\begin{comment} - In L4, an address space can only be implicitely created (resp. - destroyed) with the first (resp. last) thread in that address space. -\end{comment} - -Some operations, like starting and stopping threads in a task, can not -be supported by the task server, but have to be implemented locally in -each task because of the minimality of L4. If external control over -the threads in a task at this level is required, the debugger -interface might be used (see section \ref{debug} on page -\pageref{debug}). - - -\subsection{Accounting} - -We want to allow the users of the system to use the \texttt{task} -server directly, and ignore other task management facilities like the -\texttt{proc} server. However, the system administrator still needs -to be able to identify the user who created such anonymous tasks. 
- -For this, a simple accounting mechanism is provided by the task -server. An identifier can be set for a task by the task manager -capability, which is inherited at task creation time from the parent -task. This accounting ID can not be changed without the task manager -capability. - -The \texttt{proc} server sets the accounting ID to the process ID -(PID) of the task whenever a task registers itself with the -\texttt{proc} server. This means that all tasks which do not register -themselves with the \texttt{proc} server will be grouped together with -the first parent task that did. This allows one to easily kill all -unregistered tasks together with their registered parent. - -The \texttt{task} server does not interpret or use the accounting ID -in any way. - - -\subsection{Proxy Task Server} -\label{proxytaskserver} - -The \texttt{task} server can be safely proxied, and the users of such -a proxy task server can use it like the real \texttt{task} server, -even though capabilities work a bit differently for the \texttt{task} -server than for other servers. - -The problem exists because the proxy task server would hold the real -task info capabilities for the task info capabilities that it provides -to the proxied task. So if the proxy task server dies, all such task -info capabilities would be released, and the tasks using the proxy -task server would become insecure and open to attacks by imposters. - -However, this is not really a problem, because the proxy task server -will also provide proxy objects for all task control capabilities. So -it will be the only task which holds task control capabilities for the -tasks that use it. When the proxy task server dies, all tasks that -were created with it will be destroyed when these task control -capabilities are released. The proxy task server is a vital system -component for the tasks that use it, just as the real \texttt{task} -server is a vital system component for the whole system. 
- - -\subsection{Scheduling} - -The task server is the natural place to implement a simple, initial -scheduler for the Hurd. A first version can at least collect some -information about the cpu time of a task and its threads. Later a -proper scheduler has to be written that also has SMP support. - -The scheduler should run at a higher priority than normal threads. - -\begin{comment} - This might require that the whole task server must run at a higher - priority, which makes sense anyway. - - Not much thought has been given to the scheduler so far. This is - work that still needs to be done. -\end{comment} - -There is no way to get at the ``system time'' in L4, it is assumed -that no time is spent in the kernel (which is mostly true). So system -time will always be reported as $0.00$, or $0.01$. - - -\section{Virtual Memory Management} - -Traditionally, monolithical kernels, but even kernels like Mach, -provide a virtual memory management system in the kernel. All paging -decisions are made by the kernel itself. This requires good -heuristics. Smart paging decisions are often not possible because the -kernel lacks the information about how the data is used. - -In the Hurd, paging will be done locally in each task. A physical -memory server provides a number of guaranteed physical pages to tasks. -It will also provide a number of excess pages (over-commit). The task -might have to return any number of excess pages on short notice. If -the task does not comply, all mappings are revoked (essentially -killing the task). - -A problem arises when data has to be exchanged between a client and a -server, and the server wants to have control over the content of the -pages (for example, pass it on to other servers, like device drivers). -The client can not map the pages directly into the servers address -space, as it is not trusted. 
Container objects created in the -physical memory server and mapped into the client and/or the servers -address space will provide the necessary security features to allow -this. This can be used for DMA and zero-copying in the data exchange -between device drivers and (untrusted) user tasks. - - -\section{Authentication} -\label{auth} - -Capabilities are a good way to give access to protected objects and -services. They are flexible, lightweight and generic. However, Unix -traditionally uses access control lists (ACL) to restrict access to -objects like files. Any task running with a certain user ID can -access all files that are readable for the user with that user ID. -Although all objects are implemented as capabilities in the Hurd, the -Hurd also supports the use of user IDs for access control. - -The system authentication server \texttt{auth} implements the Unix -authentication scheme using capabilities. It provides auth -capabilities, which are associated with a list of effective and -available user and group IDs. The holder of such a capability can use -it to authenticate itself to other servers, using the protocol below. - -Of course, these other servers must use (and trust) the same -\texttt{auth} server as the user. Otherwise, the authentication will -fail. Once a capability is authenticated in the server, the server -will know the user IDs of the client, and can use them to validate -further operations. - -The \texttt{auth} server provides two types of capabilities: - -\paragraph{Auth capabilities} -An auth capability is associated with four vectors of IDs: The -effective user and group IDs, which should be used by other servers to -authenticate operations that require certain user or group IDs, and -the available user and group IDs. Available IDs should not be used -for authentication purposes, but can be turned into effective IDs by -the holder of an auth capability at any time. 
- -New auth capabilities can be created from existing auth capabilities, -but only if the requested IDs are a subsets from the union of the -(effective and available) IDs in the provided auth capabilities. If -an auth capability has an effective or available user ID 0, then -arbitrary new auth objects can be created from that. - -\paragraph{Passport capabilities} -A passport capability can be created from an auth capability and is -only valid for the task that created it. It can be provided to a -server in an authentication process (see below). For the client, the -passport capability does not directly implement any useful operation. -For the server, it can be used to verify the identity of a user and -read out the effective user and group IDs. - -The auth server should always create new passport objects for -different tasks, even if the underlying auth object is the same, so -that a task having the passport capability can not spy on other tasks -unless they were given the passport capability by that task. - -\subsection{Authenticating a client to a server} - -A client can authenticate itself to a server with the following -protocol: - -\paragraph{Preconditions} -The client $C$ has an auth capability implemented by the \texttt{auth} -server $A$. It also has a capability implemented by the server $S$. -It wants to reauthenticate this capability with the auth capability, -so the server associates the new user and group IDs with it. - -The server also has an auth capability implemented by its trusted -\texttt{auth} server. For the reauthentication to succeed, the -\texttt{auth} server of the client and the server must be identical. -If this is the case, the participating tasks hold task info caps for -all other participating tasks (because of the capabilities they hold). - -\begin{enumerate} -\item The client $C$ requests the passport capability for itself from - the auth capability from $A$. 
- - \begin{comment} - Normally, the client will request the passport capability only - once and store it together with the auth capability. - \end{comment} - -\item The \texttt{auth} server receives the request and creates a new - passport capability for this auth capability and this client. The - passport capability is returned to the user. - -\item The user receives the reply from the \texttt{auth} server. - - It then sends the reauthentication request to the server $S$, which - is invoked on the capability the client wants to reauthenticate. It - provides the passport capability as an argument. - -\item The server $S$ can accept the passport capability, if it - verifies that it is really implemented by the \texttt{auth} server - it trusts. If the client does not provide a passport capability to - the trusted \texttt{auth} server, the authentication process is - aborted with an error. - - Now the server can send a request to the \texttt{auth} server to - validate the passport capability. The RPC is invoked on the - passport capability. - -\item The \texttt{auth} server receives the validation request on the - passport capability and returns the task ID of the client $C$ that - this passport belongs to, and the effective user and group IDs for - the auth cap to which this passport cap belongs. - - \begin{comment} - The Hurd on Mach returned the available IDs as well. This feature - is not used anywhere in the Hurd, and as the available IDs should - not be used for authentication anyway, this does not seem to be - useful. If it is needed, it can be added in an extended version - of the validation RPC. - \end{comment} - -\item The server receives the task ID and the effective user and group - IDs. The server now verifies that the task ID is the same as the - task ID of the sender of the reauthentication request. Only then - was the reauthentication request made by the owner of the auth cap. 
- It can then return a new capability authenticated with the new user - and group IDs. - - \begin{comment} - The verification of the client's task ID is necessary. As the - passport cap is copied to other tasks, it can not serve as a proof - of identity alone. It is of course absolutely crucial that the - server holds the task info cap for the client task $C$ for the - whole time of the protocol. But the same is actually true for any - RPC, as the server needs to be sure that the reply message is sent - to the sender thread (and not any imposter). - \end{comment} - -\item The client receives the reply with the new, reauthenticated - capability. Usually this capability is associated in the server - with the same abstract object, but different user credentials. - - \begin{comment} - Of course a new capability must be created. Otherwise, all other - users holding the same capability would be affected as well. - \end{comment} - - The client can now deallocate the passport cap. - - \begin{comment} - As said before, normally the passport cap is cached by the client - for other reauthentications. - \end{comment} -\end{enumerate} - -\paragraph{Result} -The client $C$ has a new capability that is authenticated with the new -effective user and group IDs. The server has obtained the effective -user and group IDs from the \texttt{auth} server it trusts. - -\begin{comment} - The Hurd on Mach uses a different protocol, which is more complex - and is vulnerable to DoS attacks. The above protocol can not - readily be used on Mach, because the sender task of a message can - not be easily identified. -\end{comment} - - -\section{The POSIX personality} - -The Hurd offers a POSIX API to the user by default. This is -implemented in the GNU C library which uses the services provided by -the Hurd servers. Several system servers support the C library. - - -\subsection{Process Management} -\label{proc} - -The \texttt{proc} server implements Unix process semantics in the Hurd -system. 
It will also assign a PID to each task that was created with -the \texttt{task} server, so that the owner of these tasks, and the -system administrator, can at least send the \verb/SIGKILL/ signal to -them. - -The \texttt{proc} server uses the task manager capability from the -\texttt{task} server to get hold of the information about all tasks -and the task control caps. - -\begin{comment} - The \texttt{proc} server might also be the natural place to - implement a first policy server for the \texttt{task} server. -\end{comment} - - -\subsubsection{Signals} -\label{signals} - -Each process can register the thread ID of a signal thread with the -\texttt{proc} server. The proc server will give the signal thread ID -to any other task which asks for it. - -\begin{comment} - The thread ID can be guessed, so there is no point in protecting it. -\end{comment} - -The signal thread ID can then be used by a task to contact the task to -which it wants to send a signal. The task must bootstrap its -connection with the intended receiver of the signal, according to the -protocol described in section \ref{ipcbootstrap} on page -\pageref{ipcbootstrap}. As a result, it will receive the signal -capability of the receiving task. - -The sender of a signal must then provide some capability that proves -that the sender is allowed to send the signal when a signal is posted -to the signal capability. For example, the owner of the task control -cap is usually allowed to send any signal to it. Other capabilities -might only give permission to send some types of signals. - -\begin{comment} - The receiver of the signal decides itself which signals to accept - from which other tasks. The default implementation in the C library - provides POSIX semantics, plus some extensions. -\end{comment} - -Signal handling is thus completely implemented locally in each task. -The \texttt{proc} server only serves as a name-server for the thread -IDs of the signal threads. 
- -\begin{comment} - The \texttt{proc} server can not hold the signal capability itself, - as it used to do in the implementation on Mach, as it does not trust - the tasks implementing the capability. But this is not a problem, - as the sender and receiver of a signal can negotiate and bootstrap - the connection without any further support by the \texttt{proc} - server. - - Also, the \texttt{proc} server can not even hold task info caps to - support the sender of a signal in bootstrapping the connection. - This means that there is a race between looking up the signal thread - ID from the PID in the \texttt{proc} server and acquiring a task - info cap for the task ID of the signal receiver in the sender. - However, in Unix, there is always a race when sending a signal using - \verb/kill/. The task server helps the users a bit here by not - reusing task IDs as long as possible. -\end{comment} - -Some signals are not implemented by sending a message to the task. -\verb/SIGKILL/ for example destroys the tasks without contacting it at -all. This feature is implemented in the \texttt{proc} server. - -The signal capability is also used for other things, like the message -interface (which allows you to manipulate the environment variables -and \texttt{auth} capability of a running task, etc). - - -\subsubsection{The \texttt{fork()} function} - -To be written. - - -\subsubsection{The \texttt{exec()} function} -\label{exec} - -The \texttt{exec()} operation will be done locally in a task. -Traditionally, \texttt{exec()} overlays the same task with a new -process image, because creating a new task and transferring the -associated state is expensive. In L4, only the threads and virtual -memory mappings are actually kernel state associated with a task, and -exactly those have to be destroyed by \texttt{exec()} anyway. There -is a lot of Hurd specific state associated with a task (capabilities, -for example), but it is difficult to preserve that. 
There are -security concerns, because POSIX programs do not know about Hurd -features like capabilities, so inheriting all capabilities across -\texttt{exec()} unconditionally seems dangerous. - -\begin{comment} - One could think that if a program is not Hurd-aware, then it will - not make any use of capabilities except through the normal POSIX - API, and thus there are no capabilities except those that the GNU C - library uses itself, which \texttt{exec()} can take care of. - However, this is only true if code that is not Hurd-aware is never - mixed with Hurd specific code, even libraries (unless the library - intimately cooperates with the GNU C library). This would be a high - barrier to enable Hurd features in otherwise portable programs and - libraries. - - It is better to make all POSIX functions safe by default and allow - for extensions to let the user specify which capabilities besides - those used for file descriptors etc to be inherited by the new - executable. - - For \verb/posix_spawn()/, this is straight-forward. For - \texttt{exec()}, it is not. Either specific capabilities could be - marked as ``do not close on \texttt{exec()}'', or variants of the - \texttt{exec()} function could be provided which take further - arguments. -\end{comment} - -There are also implementation obstacles hindering the reuse of the -existing task. Only local threads can manipulate the virtual memory -mappings, and there is a lot of local state that has to be kept -somewhere between the time the old program becomes defunct and the new -binary image is installed and used (not to speak of the actual program -snippet that runs during the transition). - -So the decision was made to always create a new task with -\texttt{exec()}, and copy the desired state from the current task to -the new task. 
This is a clean solution, because a new task will -always start out without any capabilities in servers, etc, and thus -there is no need for the old task to try to destroy all unneeded -capabilities and other local state before \texttt{exec()}. Also, in -case the exec fails, the old program can continue to run, even if the -exec fails at a very late point (there is no ``point of no return'' -until the new task is actually up and running). - -For suid and sgid applications, the actual \texttt{exec()} has to be -done by the filesystem. However, the filesystem can not be bothered -to also transfer all the user state into the new task. It can not -even do that, because it can not accept capabilities implemented by -untrusted servers from the user. Also, the filesystem does not want -to rely on the new task to be cooperative, because it does not -necessarily trust the code, if it is owned by an untrusted user. - -\begin{enumerate} -\item The user creates a new task and a container with a single - physical page, and makes the \texttt{exec()} call to the file - capability, providing the task control capability. Before that, it - creates a task info capability from it for its own use. -\item The filesystem checks permission and then revokes all other - users on the task control capability. This will revoke the user's - access to the task, and will fail if the user did not provide a - pristine task object. (It is assumed that the filesystem should not - create the task itself so the user can not use suid/sgid - applications to escape from their quota restriction). -\item Then it revokes access to the provided physical page and writes - a trusted startup code to it. -\item The filesystem will also prepare all capability transactions and - write the required information (together with other useful - information) in a stack on the physical page. -\item Then it creates a thread in the task, and starts it. At - pagefault, it will provide the physical page. 
-\item The startup code on the physical page completes the capability - transfer. It will also install a small pager that can install file - mappings for this binary image. Then it jumps to the entry point. -\item The filesystem in the meanwhile has done all it can do to help - the task startup. It will provide the content of the binary or - script via paging or file reads, but that happens asynchronously, - and as for any other task. So the filesystem returns to the client. -\item The client can then send its untrusted information to the new - task. The new task got the client's thread ID from the filesystem - (possibly provided by the client), and thus knows to which thread it - should listen. The new task will not trust this information - ultimately (ie, the new task will use the authentication, root - directory and other capabilities it got from the filesystem), but it - will accept all capabilities and make proper use of them. -\item Then the new task will send a message to proc to take over the - old PID and other process state. How this can be done best is still - to be determined (likely the old task will provide a process control - capability to the new task). At that moment, the old task is - destroyed by the proc server. -\end{enumerate} - -This is a coarse and incomplete description, but it shows the general -idea. The details will depend a lot on the actual implementation. - - -\subsection{Unix Domain Sockets} -\label{unixdomainsockets} - -In the Hurd on Mach, there was a global pflocal server that provided -unix domain sockets and pipes to all users. This will not work very -well in the Hurd on L4, because for descriptor passing, read: -capability passing, the unix domain socket server needs to accept -capabilities in transit. User capabilities are often implemented by -untrusted servers, though, and thus a global pflocal server running as -root can not accept them. 
- -However, unix domain sockets and pipes can not be implemented locally -in the task. An external task is needed to hold buffered data -capabilities in transit. in theory, a new task could be used for -every pipe or unix domain socketpair. However, in practice, one -server for each user would suffice and perform better. - -This works, because access to Unix Domain Sockets is controlled via -the filesystem, and access to pipes is controlled via file -descriptors, usually by inheritance. For example, if a fifo is -installed as a passive translator in the filesystem, the first user -accessing it will create a pipe in his pflocal server. From then on, -an active translator must be installed in the node that redirects any -other users to the right pflocal server implementing this fifo. This -is asymmetrical in that the first user to access a fifo will implement -it, and thus pay the costs for it. But it does not seem to cause any -particular problems in implementing the POSIX semantics. - -The GNU C library can contact ~/servers/socket/pflocal to implement -socketpair, or start a pflocal server for this task's exclusive use if -that node does not exist. - -All this are optimizations: It should work to have one pflocal process -for each socketpair. However, performance should be better with a -shared pflocal server, one per user. - - -\subsection{Pipes} - -Pipes are implemented using \texttt{socketpair()}, that means as -unnamed pair of Unix Domain Sockets. The \texttt{pflocal} server will -support this by implementing pipe semantics on the socketpair if -requested. - -\begin{comment} - It was considered to use shared memory for the pipe implementation. - But we are not aware of a lock-free protocol using shared memory - with multiple readers and multiple writers. It might be possible, - but it is not obvious if that would be faster: Pipes are normally - used with \texttt{read()} and \texttt{write()}, so the data has to - be copied from and to the supplied buffer. 
This can be done - efficiently in L4 even across address spaces using string items. In - the implementation using sockets, the \texttt{pflocal} server - handles concurrent read and write accesses with mutual exclusion. -\end{comment} - - -\subsection{Filesystems} - -\subsubsection{Directory lookup across filesystems} -\label{xfslookup} - -The Hurd has the ability to let users mount filesystems and other -servers providing a filesystem-like interface. Such filesystem -servers are called translators. In the Hurd on GNU Mach, the parent -filesystem would automatically start up such translators from passive -translator settings in the inode. It would then block until the child -filesystem sends a message to its bootstrap port (provided by the -parent fs) with its root directory port. This root directory port can -then be given to any client looking up the translated node. - -There are several things wrong with this scheme, which becomes -apparent in the Hurd on L4. The parent filesystem must be careful to -not block on creating the child filesystem task. It must also be -careful to not block on receiving any acknowledgement or startup -message from it. Furthermore, it can not accept the root directory -capability from the child filesystem and forward it to clients, as -they are potentially not trusted. - -The latter problem can be solved the following way: The filesystem -knows about the server thread in the child filesystem. It also -implements an authentication capability that represents the ability to -access the child filesystem. This capability is also given to the -child filesystem at startup (or when it attaches itself to the parent -filesystem). On client dir\_lookup, the parent filesystem can return -the server\_thread and the authentication capability to the client. 
-The client can use that to initiate a connection with the child -filesystem (by first building up a connection, then sending the -authentication capability from the parent filesystem, and receiving a -root directory capability in exchange). - -\begin{comment} - There is a race here. If the child filesystem dies and the parent - filesystem processes the task death notification and releases the - task info cap for the child before the user acquires its own task - info cap for the child, then an imposter might be able to pretend to - be the child filesystem for the client. - - This race can only be avoided by a more complex protocol: - - Variant 1: The user has to acquire the task info cap for the child - fs, and then it has to perform the lookup again. If then the thread - ID is for the task it got the task ID for in advance, it can go on. - If not, it has to retry. This is not so good because a directory - lookup is usually an expensive operation. However, it has the - advantage of only slowing down the rare case. - - Variant 2: The client creates an empty reference container in the - task server, which can then be used by the server to fill in a - reference to the child's task ID. However, the client has to create - and destroy such a container for every filesystem where it excepts - it could be redirected to another (that means: for all filesystems - for which it does not use \verb/O_NOTRANS/). This is quite an - overhead to the common case. - -\begin{verbatim} -<marcus> I have another idea -<marcus> the client does not give a container -<marcus> server sees child fs, no container -> returns O_NOTRANS node -<marcus> then client sees error, uses O_NOTRANS node, "" and container -<marcus> problem solved -<marcus> this seems to be the optimum -<neal> hmm. -<neal> So lazily supply a container. -<marcus> yeah -<neal> Hoping you won't need one. -<marcus> and the server helps you by doing as much as it can usefully -<neal> And that is the normal case. 
-<neal> Yeah, that seems reasonable. -<marcus> the trick is that the server won't fail completely -<marcus> it will give you at least the underlying node -\end{verbatim} -\end{comment} - -The actual creation of the child filesystem can be performed much like -a suid exec, just without any client to follow up with further -capabilities and startup info. The only problem that remains is how -the parent filesystem can know which thread in the child filesystem -implements the initial handshake protocol for the clients to use. The -only safe way here seems to be that the parent filesystem requires the -child to use the main thread for that, or that the parent filesystem -creates a second thread in the child at startup (passing its thread ID -in the startup data), requiring that this second thread is used. In -either case the parent filesystem will know the thread ID in advance -because it created the thread in the first place. This looks a bit -ugly, and violates good taste, so we might try to look for alternative -solutions. - - -\subsubsection{Reparenting} -\label{reparenting} - -The Hurd on Mach contains a curious RPC, \verb/file_reparent/, which -allows you to create a new capability for the same node, with the -difference that the new node will have a supplied capability as its -parent node. A directory lookup of \texttt{..} on this new capability -would return the provided parent capability. - -This function is used by the \texttt{chroot()} function, which sets -the parent node to the null capability to prevent escape from a -\texttt{chroot()} environment. It is also used by the -\texttt{firmlink} translator, which is a cross over of a symbolic and -a hard link: It works like a hard link, but can be used across -filesystems. - -A firmlink is a dangerous thing. 
Because the filesystem will give no -indication if the parent node it returns is provided by itself or some -other, possibly untrusted filesystem, the user might follow the parent -node to untrusted filesystems without being aware of it. - -In the Hurd port to L4, the filesystem can not accept untrusted parent -capabilities on behalf of the user anymore. The \texttt{chroot()} -function is not difficult to implement anyway, as no real capability -is required. The server can just be instructed to create a node with -no parent node, and it can do that without problems. Nevertheless, we -also want a secure version of the \texttt{firmlink} translator. This -is possible if the same strategy is used as in cross filesystem -lookups. The client registers a server thread as the handler for the -parent node, and the filesystem returns a capability that can be used -for authentication purposes. Now, the client still needs to connect -this to the new parent node. Normally, the filesystem providing the -new parent node will also not trust the other filesystem, and thus can -not accept the capability that should be used for authentication -purposes. So instead creating a direct link from the one filesystem -to the other, the firmlink translator must act as a middle man, and -redirect all accesses to the parent node first to itself, and then to -the filesystem providing the parent node. For this, it must request a -capability from that filesystem that can be used for authentication -purposes when bootstrapping a connection, that allows such a -bootstrapping client to access the parent node directly. - -This also fixes the security issues, because now any move away from -the filesystem providing the reparented node will explicitely go first -to the \texttt{firmlink} translator, and then to the filesystem -providing the parent node. The user can thus make an informed -decision if it trusts the \texttt{firmlink} translator and the -filesystem providing the parent node. 
- -\begin{comment} - This is a good example where the redesign of the IPC system forces - us to fix a security issue and provides a deeper insight into the - trust issues and how to solve them. -\end{comment} - - -\section{Debugging} -\label{debug} - -L4 does not support debugging. So every task has to implement a debug -interface and implement debugging locally. gdb needs to be changed to -make use of this interface. How to perform the required -authentication, and how the debug thread is advertised to gdb, and how -the debug interface should look like, are all open questions. - - -\section{Device Drivers} - -This section written by Peter De Schrijver and Daniel Wagner. - -\subsection{Requirements} - - \begin{itemize} - \item Performance: Speed is important! - \item Portability: Framework should work on different architectures. - - Also: Useable in a not hurdisch environment with only - small changes. - - \item Flexibility - \item Convenient interfaces - \item Consistency - \item Safety: driver failure should have as minimal system impact as - possible. - \end{itemize} - -\subsection{Overview} - - The framework consists of: - \begin{itemize} - \item Bus drivers - \item Device drivers - \item Service servers (plugin managers, $\omega_0$, rootserver) - \end{itemize} - -\subsubsection{Drivers and the filesystem} - - The device driver framework will only offer a physical device view. - Ie. it will be a tree with devices as the leaves connected by - various bus technologies. Any logical view and naming persistence - will have to be build on top of this (translator). - -\subsubsection{Layer of the drivers} - - The device driver framework consists only of the lower level drivers - and doesn't need to have a complicated scheme for access control. - This is because it should be possible to share devices, e.g. for - neighbour Hurd. The authentication is done by installing a virtual - driver in each OS/neighour Hurd. The driver framework trusts these - virtual drivers. 
So it's possible for a non Hurdish system to use - the driver framework just by implementing these virtual drivers. - - Only threads which have registered as trusted are allowed to access - device drivers. The check is simply done by checking the senders - ID against a table of known threads. - -\subsubsection{Address spaces} - - Drivers always reside in their own AS. The overhead for cross AS IPC - is small enough to do so. - -\subsubsection{Zero copying and DMA} - - It is assumed that there are no differences between physical memory - pages. For example each physical memory page can be used for DMA - transfers. Of course, older hardware like ISA devices can so not be - supported. Who cares? - - With this assumption, the device driver framework can be given any - physical memory page for DMA operation. This physical memory page - must be pinned down. - - If an application wants to send or receive data to/from a device - driver it has to tell the virtual driver the page on which the - operation has to be executed. Since the application doesn't know - the virtual-real memory mapping, it has to ask the physical memory - manager for the real memory address of the page in question. If the - page is not directly mapped from the physical memory manager the - application ask the mapper (another application which has mapped - this memory region the first application) to resolve the mapping. - This can be done recursively. Normally, this resolving of mapping - can be speed up using a cache services, since a small number of - pages are reused very often. - - With the scheme, the drivers do not have to take special care of - zero copying if there is only one virtual driver. When there is - more than one virtual driver pages have to copied for all other - virtual drivers. - -\subsubsection{Root bus driver} - - The root bus is the entrypoint to look up devices. - - XXX There should be iterators/visitors for operating on - busses/devices. 
(daniel) - -\subsubsection{Physical versus logical device view} - - The device driver framework will only offer a physical device view. - Ie. it will be a tree with devices as the leaves connected by - various bus technologies. Any logical view and naming persistence - will have to be build on top of this (translator). - -\subsubsection{Things for the future} - - \begin{itemize} - \item Interaction with the task server (e.g. listings driver threads - with ps,etc.) - \item Powermanagement - \end{itemize} - -\subsection{Bus Drivers} - -A bus driver is responsible to manage the bus and provide access to -devices connected to it. In practice it means a bus driver has to -perform the following tasks: - -\begin{itemize} -\item Handle hotplug events - - Busses which do not support hotplugging, will treated as if there is - 1 insertion event for every device connected to it when the bus - driver is started. Drivers which don't support autoprobing of - devices will probably have to read some configuration data from a - file or if the driver is a needed for bootstrapping configuration - can be given as argument on its stack. In some cases the bus - doesn't generate insertion/removal events, but can still support - some form of hotplug functionality if the user tells the driver when - a change to the bus configuration has happened (eg. SCSI). - -\item Configure client device drivers - - The bus driver should start the appropriate client device driver - translator when an insertion event is detected. It should also - provide the client device driver with all necessary configuration - info, so it can access the device it needs. This configuration data - typically consists of the bus addresses of the device and possibly - IRQ numbers or DMA channel ID's. The device driver is loaded by the - assotiatet plugin manager. - -\item Provide access to devices - - This means the bus driver should be able to perform a bus - transaction on behalf of a client device driver. 
In some cases this - involves sending a message and waiting for reply (eg. SCSI, USB, - IEEE 1394, Fibre Channel,...). The driver should provide - send/receive message primitives in this case. In other cases - devices on the bus can be accessed by doing a memory accesses or by - using special I/O instructions. In this case the driver should - provide mapping and unmapping primitives so a client device driver - can get access to the memory range or is allowed to access the I/O - addresses. The client device driver should use a library, which is - bus dependant, to access the device on the bus. This library hides - the platform specific details of accessing the bus. - - Furthermore the bus driver must also support rescans for hardware. - It might be that not all drivers are found during bootstrapping and - hence later on drivers could be loaded. This is done by regenerate - new attach notification sending to bus's plugin manager. The plugin - manager loads then if possible a new driver. A probe funtion is not - needed since all supported hardware can be identified by - vendor/device identifactions (unlike ISA hardware). For hardware - busses which don't support such identifaction (ISA) only static - configuration is possible (configuration scripts etc.) -\end{itemize} - - -\subsubsection{Plugin Manager} - - Each bus driver has a handle/reference to which insert/remove events - are send. The owner of the handle/refence must then take - appropriate action like loading the drivers. These actors are - called plugin managers. - -\subsubsection{Generic Bus Driver} - - Operations: - \begin{itemize} - \item notify (attach, detach) - \item string enumerate - \end{itemize} - - XXX Extract generic bus services from the PCI Bus Driver section - which could be also be used other PCI related busses (ISA) be used. - The name for this service is missleading, since a SCSI Bus Driver - does not have anything in common with a PCI bus. 
(daniel) - -\subsubsection{ISA Bus Driver} -Inherits from: - -\begin{itemize} -\item Generic Bus Driver -\end{itemize} - -Operations: -\begin{itemize} -\item (none) -\end{itemize} - -XXX The interface has not been defined up to now. (daniel) - - -\subsubsection{PCI Bus Driver} - -Inherits from: -\begin{itemize} -\item Generic Bus Driver -\end{itemize} - -Operations: -\begin{itemize} -\item map\_mmio: map a PCI BAR for MMIO -\item map\_io: map a PCI BAR for I/O -\item map\_mem: map a PCI BAR for memory -\item read\_mmio\_{8,16,32,64}: read from a MMIO register -\item write\_mmio\_{8,16,32,64}: write to a MMIO register -\item read\_io\_{8,16,32,64}: read from an IO register -\item write\_io\_{8,16,32,64}: write to an IO register -\item read\_config\_{8,16,32,?}: read from a PCI config register -\item write\_config\_{8,16,32,?}: write to a PCI config register -\item alloc\_dma\_mem(for non zero copying): allocate main memory useable for DMA -\item free\_dma\_mem (for non zero copying): free main memory useable for DMA -\item prepare\_dma\_read: write back CPU cachelines for DMAable memory area -\item sync\_dma\_write: discard CPU cachelines for DMAable memory area -\item alloc\_consistent\_mem: allocate memory which is consistent between CPU - and device -\item free\_consistent\_mem: free memory which - is consistent between CPU and device -\item get\_irq\_mapping (A,B,C,D): get the IRQ matching the INT(A,B,C,D) line -\end{itemize} - -\subsection{Device Drivers} -\subsubsection{Classes} -\begin{itemize} -\item character: This the standard tty as known in the Unix environment. -\item block -\item human input: Keyboard, mouse, ... -\item packet switched network -\item circuit switched network -\item framebuffer -\item streaming audio -\item streaming video -\item solid state storage: flash memory -\end{itemize} - -\subsubsection{Human input devices (HID) and the console} - -The HIDs and the console are critical for user interaction with the -system. 
Furthermore, the console should be working as soon as possible
First inform the -hardware about the successful IRQ acceptance. Then inform the ISRs -about the IRQ event. - -\paragraph{Edge versus level triggered IRQs} - -Edge triggered IRQs typically don't need explicit acknowledgment by -the CPU at the device level. You can just acknowledge them at the -interrupt controller level. Level triggered IRQs typically need to -explicitly acknowledged by the CPU at the device level. The CPU has to -read or write a register from the IRQ generating peripheral to make -the IRQ go away. If this is not done, the IRQ handler will be -reentered immediatly after it ended, effectively creating an endless -loop. Another way of preventing this would be to mask the IRQ. - -\paragraph{Multiple interrupt controllers} - -Some systems have multiple interrupt controllers in cascade. This is -for example the case on a PC, where you have 2 8259 interrupt -controllers. The second controller is connected to the IRQ 2 pin of -the first controller. It is also common in non PC systems which still -use some standard PC components such as a Super IO controller. In this -case the 2 8259's are connected to 1 pin of the primary interrupt -controller. Important for the software here is that you need to -acknowledge IRQ's at each controller. So to acknowledge an IRQ from -the second 8259 connected to the first 8259 connected to another -interrupt controller, you have to give an ACK command to each of those -controllers. Another import fact is that on PC architecture the order -of the ACKs is important. - -\paragraph{Shared IRQs} - -Some systems have shared IRQs. In this case the IRQ handler has to -look at all devices using the same IRQ... - -\paragraph{IRQ priorities} - -All IRQs on L4 have priorities, so if an IRQ occurs any IRQ lower then -the first IRQ will be blocked until the first IRQ has been -acknowlegded. ISR priorities must much the hardware priority (danger -of priority inversion). Furthermore the IRQ acknowledgment order is -important. 
- -The 8259 also supports a specific IRQ acknowledge iirc. But, this -scheme does not work in most level triggered IRQ environments. In -these environments you must acknowledge (or mask) the IRQ before -leaving the IRQ handler, otherwise the CPU will immediately reenter -the IRQ handler, effectively creating an endless loop. In this case L4 -would have to mask the IRQ. The IRQ thread would have to unmask it -after acknowledgement and processing. - -\paragraph{IRQ handling by L4/x86} - -The L4 kernel does handle IRQ acknowlegdment. - - -\subsubsection{$\omega_0$} - -$\omega_0$ is a system-central IRQ-logic server. It runs in the -privileged AS space in order to be allowed rerouting IRQ IPC. - -If an IRQ is shared between several devices, the drivers are daisy -chained and have to notify their peers if an IRQ IPC has arrived. - -XXX For more detail see XXX URL missing - -Operations: -\begin{itemize} -\item attach\_irq : attach an ISR thread to the IRQ -\item detach\_irq : detach an ISR thread form the IRQ -\end{itemize} - - -\subsubsection{Memory} -If no physical memory pages are provided by the OS the device driver -framework alloces pages from the physical memory manager. The device -driver framework has at no point of time to handle any virtual to -physical page mapping. - - -\subsection{Bootstrapping} - -A simpleFS provides initial drivers for bootstraping. The root bus -driver and simpleFS is loaded by grub as module. It then signals for -loading new (bus) drivers. As before if there is no driver avaible -for some reason for the device, the bus driver doesn't change the -device state and waits for a notifaction that there are new drivers -avaible. This simpleFS might be based on BSD libstand (library for -standalone applications). simpleFS doesn't need to be writeable -either. - - -\subsubsection{Plugin Manager} -A Plugin manager handles driver loading for devices. It searches for -driver in seach pathes (on filesystems). 
It's possible to add new -search pathes later. This allows the system to bootstrap with only -one search path (the simpleFS). When the search path is changed, the -device tree will be scanned for devices which don't have a driver -loaded yet. If a driver has become available, it will be loaded. - - -\subsection{Order of implementation} - -\begin{enumerate} -\item rootserver, plugin server -\item root bus server -\item pci bus -\item isa bus -\item serial port (isa bus) -\item console -\end{enumerate} - +\include{introduction} +\include{booting} +\include{ipc} +\include{threads-tasks} +\include{vmm} +\include{authentication} +\include{posix} +\include{debugging} +\include{device-drivers} \end{document} diff --git a/doc/introduction.tex b/doc/introduction.tex new file mode 100644 index 0000000..57a28f8 --- /dev/null +++ b/doc/introduction.tex @@ -0,0 +1,44 @@ +\chapter{Introduction} + +The GNU Hurd is a multi-server operating system running on top of a +microkernel (currently Mach variants). The core motivation of the +Hurd is the following: + +\begin{quote} + \emph{The operating system should enable its users to share the + resources of the system without harming each other.} +\end{quote} + +The focus is on the user, the system should try to allow the user to +do anything that is not harmful for other users. Many operating +systems either restrict what the user can do to be more secure, while +others allow the user to do everything, but fail on protecting the +users from each other effectively. + +The Hurd is designed to minimize the system code that the user is +required to use, while allowing the user to use, ignore or replace the +remaining system code, and this without harming other users. + +So while the L4 microkernel tries to minimize the policy that the +kernel enforces on the software running on it, the Hurd tries to +minimize the policy that the operating system enforces on its users. 
+Furthermore, the Hurd also aims to provide a POSIX compatible general +purpose operating system. However, this POSIX personality of the Hurd +is provided for convenience only, and to make the Hurd useful. Other +personalities can be implemented and used by the users of the system +along with the POSIX personality. This default personality of the +Hurd also provides some convenient features that allow the user to +extend the system so that all POSIX compatible programs can take +advantage of it. + +These notes are a moving target in the effort to find the best +strategy to port the Hurd to the L4 microkernel. + +\begin{comment} + Remarks about the history of a certain feature and implementation + details are set in a smaller font and separated from the main text, + just like this paragraph. Because this is work in progress, there + are naturally a lot of such comments. +\end{comment} + + diff --git a/doc/ipc.tex b/doc/ipc.tex new file mode 100644 index 0000000..522faf5 --- /dev/null +++ b/doc/ipc.tex @@ -0,0 +1,1126 @@ +\chapter{Inter-process communication (IPC)} +\label{ipc} + +The Hurd requires a capability system. Capabilities are used to proof +your identity to other servers (authentication), and access +server-side implemented objects like devices, files, directories, +terminals, and other things. The server can use a capability for +whatever it wants. Capabilities provide interfaces. Interfaces can +be invoked by sending messages to the capability. In L4, this means +that a message is sent to a thread in the server providing the +capability, with the identifier for the capability in the message. + +Capabilities are protected objects. Access to a capability needs to +be granted by the server. Once you have a capability, you can copy it +to other tasks (if the server permits it, which is usually the case). +In the Hurd, access to capabilities is always granted to a whole task, +not to individual threads. 
+ +\begin{comment} + There is no reason for the server not to permit it, because the + holder of the capability could also just act as a proxy for the + intended receiver instead copying the capability to it. The + operation might fail anyway, for example because of resource + shortage, in particular if the server puts a quota on the number of + capabilities a user can hold. +\end{comment} + +Capabilities provide two essential services to the Hurd. They are +used to restrict access to a server function, and they are the +standard interface the components in the Hurd use to communicate with +each others. Thus, it is important that their implementation is fast +and secure. + +\begin{comment} + There are several ways to implement such a capability system. A + more traditional design would be a global, trusted capability server + that provides capabilities to all its users. The L4 redirector + could be used to reroute all client traffic automatically through + this server. This approach has several disadvantages: + + \begin{itemize} + \item It adds a lot of overhead to every single RPC, because all + traffic has to be routed through the capability server, which must + then perform the authentication on the server's behalf. + \item It would be difficult to copy a capability to another task. + Either the cap server would have to provide interfaces for clients + to do it, or it would be have to know the message format for every + interface and do it automatically. + \item It would be a single point of failure. If it had a bug and + crashed, the whole system would be affected. + \item Users could not avoid it, it would be enforced system code. + \item It is inflexible. It would be hard to replace or extend at + run-time. + \end{itemize} + + Another approach is taken by CORBA with IORs. IORs contain long + random numbers which allow the server to identify a user of an + object. 
This approach is not feasible for the following reasons: + + \begin{itemize} + \item Even good random numbers can be guessed. Long enough random + numbers can reduce the likelihood to arbitrary small numbers, + though (below the probability of a hardware failure). + \item Good random numbers are in short supply, and is slow to + generate. Good pseudo random is faster, but it is still difficult + to generate. The random number generator would become a critical + part of the operating system. + \item The random number had to be transfered in every single + message. Because it would have to be long, it would have a + significant negative impact on IPC performance. + \end{itemize} +\end{comment} + +The Hurd implements the capability system locally in each task. A +common default implementation will be shared by all programs. +However, a malicious untrusted program can do nothing to disturb the +communication of other tasks. A capability is identified in the +client task by the server thread and a local identifier (which can be +different from client to client). The server thread will receive +messages for the capabilities. The first argument in the message is +the capability identifier. Although every task can get different IDs +for the same capability, a well-behaving server will give the same ID +to a client which already has a capability and gets the same +capability from another client. So clients can compare capability IDs +from the server numerically to check if two capabilities are the same, +but only if one of the two IDs is received while the client already +had the other one. + +Because access to a capability must be restricted, the server needs to +be careful in only allowing registered and known users to access the +capability. For this, the server must be sure that it can determine +the sender of a message. 
In L4, this is easy on the surface: The +kernel provides the receiving thread with the sender's thread ID, +which also contains the task ID in the version field. However, the +server must also know for sure if this task is the same task that it +gave access to the capability. Comparing the task IDs numerically is +not good enough, the server must also somehow have knowledge or +influence on how task IDs are reused when tasks die and are created. + +The same is true for the client, of course, which trusts the server +and thus must be sure that it is not tricked into trusting on +unreliable data from an imposter, or sends sensitive data to it. + +\begin{comment} + The \texttt{task} server wants to reuse thread numbers because that + makes best use of kernel memory. Reusing task IDs, the version + field of a thread ID, is not so important, but there are only 14 + bits for the version field (and the lower six bits must not be all + zero). So a thread ID is bound to be reused eventually. + + Using the version field in a thread ID as a generation number is not + good enough, because it is so small. Even on 64-bit architectures, + where it is 32 bit long, it can eventually overflow. +\end{comment} + +The best way to prevent that a task can be tricked into talking to an +imposter is to have the \texttt{task} server notify the task if the +communication partner dies. The \texttt{task} server must guarantee +that the task ID is not reused until all tasks that got such a +notification acknowledge that it is processed, and thus no danger of +confusion exists anymore. + +The \texttt{task} server provides references to task IDs in form of +\emph{task info capabilities}. If a task has a task info capability +for another task, it prevents that this other task's task ID is reused +even if that task dies, and it also makes sure that task death +notifications are delivered in that case. 
+ +\begin{comment} + Because only the \texttt{task} server can create and destroy tasks, + and assign task IDs, there is no need to hold such task info + capabilities for the \texttt{task} server, nor does the + \texttt{task} server need to hold task info capabilities for its + clients. This avoids the obvious bootstrap problem in providing + capabilities in the \texttt{task} server. This will even work if + the \texttt{task} server is not the real \texttt{task} server, but a + proxy task server (see section \ref{proxytaskserver} on page + \pageref{proxytaskserver}). +\end{comment} + +As task IDs are a global resource, care has to be taken that this +approach does not allow for a DoS-attack by exhausting the task ID +number space, see section \ref{taskinfocap} on page +\pageref{taskinfocap} for more details. + + +\section{Capabilities} + +This subsection contains implementation details about capabilities. + +A server will usually operate on objects, and not capabilities. In +the case of a filesystem, this could be file objects, for example. + +\begin{comment} + In the Hurd, filesystem servers have to keep different objects for + each time a file is looked up (or ``opened''), because some state, + for example authentication, open flags and record locks, are + associated not with the file directly, but with this instance of + opening the file. Such a state structure (``credential'') will also + contain a pointer and reference to the actual file node. For + simplicity, we will assume that the capability is associated with a + file node directly. +\end{comment} + +To provide access to the object to another task, the server creates a +capability, and associates it with the object (by setting a hook +variable in the capability). From this capability, the server can +either create send references to itself, or to other tasks. If the +server creates send references for itself, it can use the capability +just as it can use capabilities implemented by other servers. 
This
+makes access to locally and remotely implemented capabilities
+identical.  If you write code to work on capabilities, it can be used
+for remote objects as well as for local objects.
+
+If the server creates a send reference for another task (a client), a
+new capability ID will be created for this task.  This ID will only be
+valid for this task, and should be returned to the client.
+
+The client itself will create a capability object from this capability
+ID.  The capability will also contain information about the server,
+for example the server thread which should be used for sending
+messages to the capability.
+
+If the client wants to send a message, it will send it to the provided
+server thread, and use the capability ID it got from the server as the
+first argument in the RPC.  The server receives the message, and now
+has to look up the capability ID in the list of capabilities for this
+task.
+
+\begin{comment}
+  The server knows the task ID from the version field of the sender's
+  thread ID.  It can look up the list of capabilities for this task in
+  a hash table.  The capability ID can be an index into an array, so
+  the server only needs to perform a range check.  This allows the
+  server to verify quickly that the user is allowed to access the object.
+
+  This is not enough if several systems run in parallel on the same
+  host.  Then the version ID for the threads in the other systems will
+  not be under the control of the Hurd's \texttt{task} server, and can
+  thus not be trusted.  The server can still use the version field to
+  find out the task ID, which will be correct \emph{if the thread is
+    part of the same subsystem}.  It also has to verify that the
+  thread belongs to this subsystem.  Hopefully the subsystem will be
+  encoded in the thread ID.  Otherwise, the \texttt{task} server has
+  to be consulted (and, assuming that thread numbers are not shared by
+  the different systems, the result can be cached). 
+\end{comment}
+
+The server reads out the capability associated with the capability ID,
+and invokes the server stub according to the message ID field in the
+message.
+
+After the message is processed, the server sends its reply to the
+sender thread with a zero timeout.
+
+\begin{comment}
+  Servers must never block on sending messages to clients.  Even a
+  small timeout can be used for DoS-attacks.  The client can always
+  make sure that it receives the reply by using a combined send and
+  receive operation together with an infinite timeout.
+\end{comment}
+
+The above scheme assumes that the server and the client already have
+task info caps for the respective other task.  This is the normal
+case, because acquiring these task info caps is part of the protocol
+that is used when a capability is copied from one task to another.
+
+
+\subsection{Bootstrapping a client-server connection}
+\label{ipcbootstrap}
+
+If the client and the server do not know about each other yet, then
+they can bootstrap a connection without support from any other task
+except the \texttt{task} server.  The purpose of the initial handshake
+is to give both participants a chance to acquire a task info cap for
+the other participant's task ID, so they can be sure that from there on
+they will always talk to the same task as they talked to before.
+
+\subsubsection{Preconditions}
+The client knows the thread ID of the server thread that receives and
+processes the bootstrap messages.  Some other task might hold a task
+info capability to the server the client wants to connect to.
+
+\begin{comment}
+  If no such other task exists, the protocol will still work.
+  However, the client might not get a connection to the server that
+  ran at the time the client started the protocol, but rather to the
+  server that ran at the time the client acquired the task info cap
+  for the server's task ID (after step 1 below). 
+ + This is similar to how sending signals works in Unix: Technically, + at the time you write \texttt{kill 203}, and press enter, you do not + know if the process with the PID 203 you thought of will receive the + signal, or some other process that got the PID in the time between + you getting the information about the PID and writing the + \texttt{kill}-command. +\end{comment} + +FIXME: Here should be the pseudo code for the protocol. For now, you +have to take it out of the long version. + +\begin{enumerate} + +\item The client acquires a task info capability for the server's task + ID, either directly from the \texttt{task} server, or from another + task in a capability copy. From that point on, the client can be + sure to always talk to the same task when talking to the server. + + Of course, if the client already has a task info cap for the server + it does not need to do anything in this step. + +\begin{comment} + As explained above, if the client does not have any other task + holding the task info cap already, it has no secure information + about what this task is for which it got a task info cap. +\end{comment} + +\item The client sends a message to the server, requesting the initial + handshake. + +\item The server receives the message, and acquires a task info cap + for the client task (directly from the \texttt{task} server). + + Of course, if the server already has a task info cap for the client + it does not need to do anything in this step. + +\begin{comment} + At this point, the server knows that future messages from this task + will come from the same task as it got the task info cap for. + However, it does not know that this is the same task that sent the + initial handshake request in step 2 above. This shows that there is + no sense in verifying the task ID or perform any other + authentication before acquiring the task info cap. +\end{comment} + +\item The server replies to the initial handshake request with an + empty reply message. 
+ +\begin{comment} + Because the reply now can go to a different task than the request + came from, sending the reply might fail. It might also succeed and + be accepted by the task that replaced the requestor. Or it might + succeed normally. The important thing is that it does not matter to + the server at all. It would have provided the same ``service'' to + the ``imposter'' of the client, if he had bothered to do the + request. As no authentication is done yet, there is no point for + the server to bother. + + This means however, that the server needs to be careful in not + consuming too many resources for this service. However, this is + easy to achieve. Only one task info cap per client task will ever + be held in the server. The server can either keep it around until + the task dies (and a task death notification is received), or it can + clean it up after some timeout if the client does not follow up and + do some real authentication. +\end{comment} + +\item The client receives the reply message to its initial handshake + request. + +\item The client sends a request to create its initial capability. + How this request looks depends on the type of the server and the + initial capabilities it provides. Here are some examples: + + \begin{itemize} + \item A filesystem might provide an unauthenticated root directory + object in return of the underlying node capability, which is + provided by the parent filesystem and proves to the filesystem + that the user was allowed to look up the root node of this + filesystem (see section \ref{xfslookup} on page + \pageref{xfslookup}). + + \begin{comment} + In this example, the parent filesystem will either provide the + task info cap for the child filesystem to the user, or it will + hold the task info cap while the user is creating their own + (which the user has to verify by repeating the lookup, though). + Again, see section \ref{xfslookup} on page \pageref{xfslookup}. 
+
+      The unauthenticated root directory object will then have to be
+      authenticated using the normal reauthentication mechanism (see
+      section \ref{auth} on page \pageref{auth}).  This can also be
+      combined in a single RPC.
+    \end{comment}
+
+  \item Every process acts as a server that implements the signal
+    capability for this process.  Tasks that want to send a signal to
+    another task can perform the above handshake, and then provide
+    some type of authentication capability that indicates that they
+    are allowed to send a signal.  Different authentication
+    capabilities can be accepted by the signalled task for different
+    types of signals.
+
+    \begin{comment}
+      The Hurd used to store the signal capability in the proc server,
+      where authorized tasks could look it up.  This is no longer
+      possible because a server can not accept capabilities
+      implemented by untrusted tasks, see below.
+    \end{comment}
+  \end{itemize}
+
+\item The server replies with whatever capability the client
+  requested, provided that the client could provide the necessary
+  authentication capabilities, if any.
+
+  \begin{comment}
+    It is not required that the server performs any authentication at
+    all, but it is recommended, and all Hurd servers will do so.
+
+    In particular, the server should normally only allow access from
+    tasks running in the same system, if running multiple systems on
+    the same host is possible.
+  \end{comment}
+\end{enumerate}
+
+\subsubsection{Result}
+The client has a task info capability for the server and an
+authenticated capability.  The server has a task info capability for
+the client and has seen some sort of authentication for the capability
+it gave to the client.
+
+\begin{comment}
+  If you think that the above protocol is complex, you have seen
+  nothing yet!  Read on. 
+\end{comment} + + +\subsection{Returning a capability from a server to a client} + +Before we go on to the more complex case of copying a capability from +one client to another, let us point out that once a client has a +capability from a server, it is easy for the server to return more +capabilities it implements to the client. + +The server just needs to create the capability, acquire a capability +ID in the client's cap ID space, and return the information in the +reply RPC. + +FIXME: Here should be the pseudo code for the protocol. For now, you +have to take it out of the long version. + +\begin{comment} + The main point of this section is to point out that only one task + info capability is required to protect all capabilities provided to + a single task. The protocols described here always assume that no + task info caps are held by anyone (except those mentioned in the + preconditions). In reality, sometimes the required task info caps + will already be held. +\end{comment} + + +\subsection{Copying a capability from one client to another task} + +The most complex operation in managing capabilities is to copy or move +a capability from the client to another task, which subsequently +becomes a client of the server providing the capability. The +difficulty here lies in the fact that the protocol should be fast, but +also robust and secure. If any of the participants dies unexpectedly, +or any of the untrusted participants is malicious, the others should +not be harmed. + +\subsubsection{Preconditions} +The client $C$ has a capability from server $S$ (this implies that $C$ +has a task info cap for $S$ and $S$ has a task info cap for $C$). It +wants to copy the capability to the destination task $D$. For this, +it will have to make RPCs to $D$, so $C$ has also a capability from +$D$ (this implies that $C$ has a task info cap for $D$ and $D$ has a +task info cap for $C$). Of course, the client $C$ trusts its servers +$S$ and $D$. 
$D$ might trust $S$ or not, and thus accept or reject +the capability that $C$ wants to give to $D$. $S$ does not trust +either $C$ or $D$. + +The \texttt{task} server is also involved, because it provides the +task info capabilities. Everyone trusts the \texttt{task} server they +use. This does not need to be the same one for every participant. + +FIXME: Here should be the pseudo code for the protocol. For now, you +have to take it out of the long version. + +\begin{enumerate} +\item The client invokes the \verb/cap_ref_cont_create/ RPC on the + capability, providing the task ID of the intended receiver $D$ of + the capability. + +\item The server receives the \verb/cap_ref_cont_create/ RPC from the + client. It requests a task info cap for $D$ from its trusted task + server, under the constraint that $C$ is still living. + + \begin{comment} + A task can provide a constraint when creating a task info cap in + the \texttt{task} server. The constraint is a task ID. The task + server will only create the task info cap and return it if the + task with the constraint task ID is not destroyed. This allows + for a task requesting a task info capability to make sure that + another task, which also holds this task info cap, is not + destroyed. This is important, because if a task is destroyed, all + the task info caps it held are released. + + In this case, the server relies on the client to hold a task info + cap for $D$ until it established its own. See below for what can + go wrong if the server would not provide a constraint and both, + the client and the destination task would die unexpectedly. + \end{comment} + + Now that the server established its own task info cap for $D$, it + creates a reference container for $D$, that has the following + properties: + + \begin{itemize} + \item The reference container has a single new reference for the + capability. + + \item The reference container has an ID that is unique among all + reference container IDs for the client $C$. 
+
+  \item The reference container is associated with the client $C$.  If
+    $C$ dies, and the server processes the task death notification for
+    it, the server will destroy the reference container and release
+    the capability reference it has (if any).  All resources
+    associated with the reference container will be released.  If this
+    reference container was the only reason for $S$ to hold the task
+    info cap for $D$, the server will also release the task info cap
+    for $D$.
+
+  \item The reference container is also associated with the
+    destination task $D$.  If $D$ dies, and the server processes the
+    task death notification for it, the server will release the
+    capability reference that is in the reference container (if any).
+    It will not destroy the part of the container that is associated
+    with $C$.
+  \end{itemize}
+
+  The server returns the reference container ID $R$ to the client.
+
+\item The client receives the reference container ID $R$.
+
+  \begin{comment}
+    If several capabilities have to be copied in one message, the
+    above steps need to be repeated for each capability.  With
+    appropriate interfaces, capabilities could be collected so that
+    only one call per server has to be made.  We are assuming here
+    that only one capability is copied.
+  \end{comment}
+
+\item The client sends the server thread ID $T$ and the reference
+  container ID $R$ to the destination task $D$.
+
+\item The destination task $D$ receives the server thread ID $T$ and
+  the reference container ID $R$ from $C$.
+
+  It now inspects the server thread ID $T$, and in particular the task
+  ID component of it.  $D$ has to make the decision if it trusts this
+  task to be a server for it, or if it does not trust this task.
+
+  If $D$ trusts $C$, it might decide to always trust $T$, too,
+  regardless of what task contains $T$.
+
+  If $D$ does not trust $C$, it might be more picky about the task
+  that contains $T$. 
This is because $D$ will have to become a client + of $T$, so it will trust it. For example, it will block on messages + it sends to $T$. + + \begin{comment} + If $D$ is a server, it will usually only accept capabilities from + its client that are provided by specific other servers it trusts. + This can be the authentication server, for example (see section + \ref{auth} on page \pageref{auth}). + + Usually, the type of capability that $D$ wants to accept from $C$ + is then further restricted, and only one possible trusted server + implements that type of capabilities. Thus, $D$ can simply + compare the task ID of $T$ with the task ID of its trusted server + (authentication server, ...) to make the decision if it wants to + accept the capability or not. + \end{comment} + + If $D$ does not trust $T$, it replies to $C$ (probably with an error + value indicating why the capability was not accepted). In that + case, jump to step \ref{copycapout}. + + Otherwise, it requests a task info cap for $S$ from its trusted task + server, under the constraint that $C$ is still living. + + Then $D$ sends a \verb/cap_ref_cont_accept/ RPC to the server $S$, + providing the task ID of the client $C$ and the reference container + ID $R$. + +\begin{comment} + \verb/cap_ref_cont_accept/ is one of the few interfaces that is not + sent to a (real) capability, of course. Nevertheless, it is part of + the capability object interface, hence the name. You can think of + it as a static member in the capability class, that does not require + an instance of the class. +\end{comment} + +\item The server receives the \verb/cap_ref_cont_accept/ RPC from the + destination task $D$. It verifies that a reference container exists + with the ID $R$, that is associated with $D$ and $C$. + + \begin{comment} + The server will store the reference container in data structures + associated with $C$, under an ID that is unique but local to $C$. 
+
+    So $D$ needs to provide both pieces of information, the task ID
+    and the reference container ID of $C$.
+  \end{comment}
+
+  If that is the case, it takes the reference from the reference
+  container, and creates a capability ID for $D$ from it.  The
+  capability ID for $D$ is returned in the reply message.
+
+  From that moment on, the reference container is deassociated from
+  $D$.  It is still associated with $C$, but it does not contain any
+  reference for the capability.
+
+  \begin{comment}
+    It is not deassociated from $C$ and removed completely, so that
+    its ID $R$ (or at least the part of it that is used for $C$) is
+    not reused.  $C$ must explicitly destroy the reference container
+    anyway because $D$ might die unexpectedly or return an error that
+    gives no indication if it accepted the reference or not.
+  \end{comment}
+
+\item The destination task $D$ receives the capability ID and enters
+  it into its capability system.  It sends a reply message to $C$.
+
+  \begin{comment}
+    If the only purpose of the RPC was to copy the capability, the
+    reply message can be empty.  Usually, capabilities will be
+    transferred as part of a larger operation, though, and more work
+    will be done by $D$ before returning to $C$.
+  \end{comment}
+
+\item \label{copycapout} The client $C$ receives the reply from $D$.
+  Regardless of whether it indicated failure or success, it will now
+  send the \verb/cap_ref_cont_destroy/ message to the server $S$,
+  providing the reference container $R$.
+
+  \begin{comment}
+    This message can be a simple message.  It does not require a reply
+    from the server.
+  \end{comment}
+
+\item The server receives the \verb/cap_ref_cont_destroy/ message and
+  removes the reference container $R$.  The reference container is
+  deassociated from $C$ and $D$.  If this was the only reason that $S$
+  held a task info cap for $D$, this task info cap is also released. 
+ + \begin{comment} + Because the reference container can not be deassociated from $C$ + by any other means than this interface, the client does not need + to provide $D$. $R$ can not be reused without the client $C$ + having it destroyed first. This is different from the + \verb/cap_ref_cont_accept/ call made by $D$, see above. + \end{comment} + +\end{enumerate} + +\subsubsection{Result} +For the client $C$, nothing has changed. The destination task $D$ +either did not accept the capability, and nothing has changed for it, +and also not for the server $S$. Or $D$ accepted the capability, and +it now has a task info cap for $S$ and a reference to the capability +provided by $S$. In this case, the server $S$ has a task info cap for +$D$ and provides a capability ID for this task. + +The above protocol is for copying a capability from $C$ to $D$. If +the goal was to move the capability, then $C$ can now release its +reference to it. + +\begin{comment} + Originally we considered to move capabilities by default, and + require the client to acquire an additional reference if it wanted + to copy it instead. However, it turned out that for the + implementation, copying is easier to handle. One reason is that the + client usually will use local reference counting for the + capabilities it holds, and with local reference counting, one + server-side reference is shared by many local references. In that + case, you would need to acquire a new server-side reference even if + you want to move the capability. The other reason is cancellation. + If an RPC is cancelled, and you want to back out of it, you need to + restore the original situation. And that is easier if you do not + change the original situation in the first place until the natural + ``point of no return''. +\end{comment} + +The above protocol quite obviously achieves the result as described in +the above concluding paragraph. However, many other, and often +simpler, protocols would also do that. 
The other protocols we looked
+at are not secure or robust though, or require more operations.  To
+date we think that the above is the shortest (in particular in number
+of IPC operations) protocol that is also secure and robust (and if it
+is not we think it can be fixed to be secure and robust with minimal
+changes).  We have no proof for its correctness.  Our confidence comes
+from the scrutiny we applied to it.  If you find a problem with the
+above protocol, or if you can prove various aspects of it, we would
+like to hear about it.
+
+To understand why the protocol is laid out as it is, and why it is a
+secure and robust protocol, one has to understand what could possibly
+go wrong and why it does not cause any problems for any participant if
+it follows its part of the protocol (independent of what the other
+participants do).  In the following paragraphs, various scenarios are
+suggested where things do not go as expected in the above protocol.
+This is probably not a complete list, but it should come close to it.
+If you find any other problematic scenario, again, let us know.
+
+\begin{comment}
+  Although some comments like this appear in the protocol description
+  above, many comments have been spared for the following analysis of
+  potential problems.  Read the analysis carefully, as it provides
+  important information about how, and more importantly, why it works.
+\end{comment}
+
+\subsubsection{The server $S$ dies}
+What happens if the server $S$ dies unexpectedly sometime throughout
+the protocol?
+
+\begin{comment}
+  At any time a task dies, the task info caps it held are released.
+  Also, task death notifications are sent to any task that holds task
+  info caps to the now dead task.  The task death notifications will
+  be processed asynchronously, so they might be processed immediately,
+  or at any later time, even much later after the task died! 
So one + important thing to keep in mind is that the release of task info + caps a task held, and other tasks noticing the task death, are + always some time apart. +\end{comment} + +Because the client $C$ holds a task info cap for $S$ no imposter can +get the task ID of $S$. $C$ and $D$ will get errors when trying to +send messages to $S$. + +\begin{comment} + You might now wonder what happens if $C$ also dies, or if $C$ is + malicious and does not hold the task info cap. You can use this as + an exercise, and try to find the answer on your own. The answers + are below. +\end{comment} + +Eventually, $C$ (and $D$ if it already got the task info cap for $S$) +will process the task death notification and clean up their state. + +\subsubsection{The client $C$ dies} +The server $S$ and the destination task $D$ hold a task info cap for +$C$, so no imposter can get its task ID. $S$ and $D$ will get errors +when trying to send messages to $C$. Depending on when $C$ dies, the +capability might be copied successfully or not at all. + +Eventually, $S$ and $D$ will process the task death notification and +release all resources associated with $C$. If the reference was not +yet copied, this will include the reference container associated with +$C$, if any. If the reference was already copied, this will only +include the empty reference container, if any. + +\begin{comment} + Of course, the participants need to use internal locking to protect + the integrity of their internal data structures. The above protocol + does not show where locks are required. In the few cases where some + actions must be performed atomically, a wording is used that + suggests that. +\end{comment} + +\subsubsection{The destination task $D$ dies} + +The client $C$ holds a task info cap for $D$ over the whole operation, +so no imposter can get its task ID. 
Depending on when $D$ dies, it
+has either not yet accepted the capability, then $C$ will clean up by
+destroying the reference container, or it has, and then $S$ will clean
+up its state when it processes the task death notification for $D$.
+
+\subsubsection{The client $C$ and the destination task $D$ die}
+
+This scenario is the reason why the server acquires its own task info
+cap for $D$ so early, and why it must do that under the constraint
+that $C$ still lives.  If $C$ and $D$ die before the server created
+the reference container, then either no request was made, or creating
+the task info cap for $D$ fails because of the constraint.  If $C$ and
+$D$ die afterwards, then no imposter can get the task ID of $D$ and
+try to get at the reference in the container, because the server has
+its own task info cap for $D$.
+
+\begin{comment}
+  This problem was identified very late in the development of this
+  protocol.  We just did not think of both clients dying at the same
+  time!  In an earlier version of the protocol, the server would
+  acquire its task info cap when $D$ accepts its reference.  This is
+  too late:  If $C$ and $D$ die just before that, an imposter with
+  $D$'s task ID can try to get the reference in the container before
+  the server processes the task death notification for $C$ and
+  destroys it.
+\end{comment}
+
+Eventually, the server will receive and process the task death
+notifications.  If it processes the task death notification for $C$
+first, it will destroy the whole container immediately, including the
+reference, if any.  If it processes the task death notification for
+$D$ first, it will destroy the reference, and leave behind the empty
+container associated with $C$, until the other task death notification
+is processed.  Either way no imposter can get at the capability.
+
+Of course, if the capability was already copied at the time $C$ and
+$D$ die, the server will just do the normal cleanup. 
+
+\subsubsection{The client $C$ and the server $S$ die}
+
+This scenario does not cause any problems, because on the one hand,
+the destination task $D$ holds a task info cap for $C$, and it
+acquires its own task info cap for $S$.  Although it does this quite
+late in the protocol, it does so under the constraint that $C$ still
+lives, which has a task info cap for $S$ for the whole time (until it
+dies).  It also gets the task info cap for $S$ before sending any
+message to it.  An imposter with the task ID of $S$, which it was
+possible to get because $C$ died early, would not receive any message
+from $D$ because $D$ uses $C$ as its constraint in acquiring the task
+info cap for $S$.
+
+\subsubsection{The destination task $D$ and the server $S$ die}
+
+As $C$ holds task info caps for $S$ and $D$, there is nothing that can
+go wrong here.  Eventually, the task death notifications are
+processed, but the task info caps are not released until the protocol
+is completed or aborted because of errors.
+
+\subsubsection{The client $C$, the destination task $D$ and the server $S$ die}
+
+Before the last one of these dies, you are in one of the scenarios
+which already have been covered.  After the last one dies, there is
+nothing to take care of anymore.
+
+\begin{comment}
+  In this case your problem is probably not the capability copy
+  protocol, but the stability of your software!  Go fix some bugs.
+\end{comment}
+
+So far we have covered the scenarios where one or more of the
+participating tasks die unexpectedly.  They could also die
+purposefully.  Other things that tasks can try to do purposefully to
+break the protocol are presented in the following paragraphs.
+
+\begin{comment}
+  A task that tries to harm other tasks by not following a protocol
+  and behaving as other tasks might expect it is malicious.  Beside
+  security concerns, this is also an issue of robustness, because
+  malicious behaviour can also be triggered by bugs rather than bad
+  intentions. 
+
+  It is difficult to protect against malicious behaviour by trusted
+  components, like the server $S$, which is trusted by both $C$ and
+  $D$.  If a trusted component is compromised or buggy, ill
+  consequences for software that trusts it must be expected.  Thus, no
+  analysis is provided for scenarios involving a malicious or buggy
+  server $S$.
+\end{comment}
+
+\subsubsection{The client $C$ is malicious}
+
+If the client $C$ wants to break the protocol, it has numerous
+possibilities to do so.  The first thing it can do is to provide a
+wrong destination task ID when creating the container.  But in this
+case, the server will return an error to $D$ when it tries to accept
+it, and this will give $D$ a chance to notice the problem and clean
+up.  This also would allow for some other task to receive the
+container, but the client can give the capability to any other task it
+wants to anyway, so this is not a problem.
+
+\begin{comment}
+  If a malicious behaviour results in an outcome that can also be
+  achieved following the normal protocol with different parameters,
+  then this is not a problem at all.
+\end{comment}
+
+The client could also try to create a reference container for $D$ and
+then not tell $D$ about it.  However, a reference container should not
+consume a lot of resources in the server, and all such resources
+should be attributed to $C$.  When $C$ dies eventually, the server
+will clean up any such pending containers when the task death
+notification is processed.
+
+The same argument holds when $C$ leaves out the call to
+\verb/cap_ref_cont_destroy/.
+
+The client $C$ could also provide wrong information to $D$.  It could
+supply a wrong server thread ID $T$.  It could supply a wrong
+reference container ID $R$.  If $D$ does not trust $C$ and expects a
+capability implemented by some specific trusted server, it will verify
+the thread ID numerically and reject it if it does not match.
The
+reference container ID will be verified by the server, and it will
+only be accepted if the reference container was created by the client
+task $C$.  Thus, the only wrong reference container IDs that the
+client $C$ could use to not provoke an error message from the server
+(which would then lead $D$ to abort the operation) would be a reference
+container that it created itself in the first place.  However, $C$
+already is free to send $D$ any reference container it created.
+
+\begin{comment}
+  Again $C$ can not achieve anything it could not achieve by just
+  following the protocol as well.  If $C$ tries to use the same
+  reference container with several RPCs in $D$, one of them would
+  succeed and the others would fail, hurting only $C$.
+
+  If $D$ does trust $C$, then it can not protect against malicious
+  behaviour by $C$.
+\end{comment}
+
+To summarize the result so far:  $C$ can provide wrong data in the
+operations it does, but it can not achieve anything this way that it
+could not achieve by just following the protocol.  In most cases the
+operation would just fail.  If it leaves out some operations, trying
+to provoke resource leaks in the server, it will only hurt itself (as
+the reference container is strictly associated with $C$ until the
+reference is accepted by $D$).
+
+\begin{comment}
+  For optimum performance, the server should be able to keep the
+  information about the capabilities and reference containers a client
+  holds on memory that is allocated on the client's behalf.
+
+  It might also use some type of quota system.
+\end{comment}
+
+Another attack that $C$ can attempt is to deny a service that $S$ and
+$D$ are expecting of it.  Besides not doing one or more of the RPCs,
+this is in particular holding the task info caps for the time span as
+described in the protocol.  Of course, this can only be potentially
+dangerous in combination with a task death.
If $C$ does not hold the +server task info capability, then an imposter of $S$ could trick $D$ +into using the imposter as the server. However, this is only possible +if $D$ already trusts $C$. Otherwise it would only allow servers that +it already trusts, and it would always hold task info caps to such +trusted servers when making the decision that it trusts them. +However, if $D$ trusts $C$, it can not protect against $C$ being +malicious. + +\begin{comment} + If $D$ does not trust $C$, it should only ever compare the task ID + of the server thread against trusted servers it has a task info cap + for. It must not rely on $C$ doing that for $D$. + + However, if $D$ does trust $C$, it can rely on $C$ holding the + server task info cap until it got its own. Thus, the task ID of $C$ + can be used as the constraint when acquiring the task info cap in + the protocol. +\end{comment} + +If $C$ does not hold the task info cap of $D$, and $D$ dies before the +server acquires its task info cap for $D$, it might get a task info +cap for an imposter of $D$. But if the client wants to achieve that, +it could just follow the protocol with the imposter as the destination +task. + +\subsubsection{The destination task $D$ is malicious} + +The destination task has not as many possibilities as $C$ to attack +the protocol. This is because it is trusted by $C$. So the only +participant that $D$ can try to attack is the server $S$. But the +server $S$ does not rely on any action by $D$. $D$ does not hold any +task info caps for $S$. The only operation it does is an RPC to $S$ +accepting the capability, and if it omits that it will just not get +the capability (the reference will be cleaned up by $C$ or by the +server when $C$ dies). + +The only thing that $D$ could try is to provide false information in +the \verb/cap_ref_cont_accept/ RPC. The information in that RPC is +the task ID of the client $C$ and the reference container ID $R$. 
The
+server will verify that the client $C$ has previously created a
+reference container with the ID $R$ that is destined for $D$.  So $D$
+will only be able to accept references that it is granted access to.
+So it can not achieve anything that it could not achieve by following
+the protocol (possibly the protocol with another client).  If $D$
+accepts capabilities from other transactions outside of the protocol,
+it can only cause other transactions in its own task to fail.
+
+\begin{comment}
+  If you can do something wrong and harm yourself that way, then this
+  is called ``shooting yourself in your foot''.
+
+  The destination task $D$ is welcome to shoot itself in its foot.
+\end{comment}
+
+\subsubsection{The client $C$ and the destination task $D$ are malicious}
+
+The final question we want to raise is what can happen if the client
+$C$ and the destination task $D$ are malicious.  Can $C$ and $D$
+cooperate in attacking $S$ in a way that $C$ or $D$ alone could not?
+
+In the above analysis, there is no place where we assume any specific
+behaviour of $D$ to help $S$ in preventing an attack on $S$.  There is
+only one place where we make an assumption for $C$ in the analysis of
+a malicious $D$.  If $D$ does not accept a reference container, we
+said that $C$ would clean it up by calling
+\verb/cap_ref_cont_destroy/.  So we have to look at what would happen
+if $C$ were not to do that.
+
+Luckily, we covered this case already.  It is identical to the case
+where $C$ does not even tell $D$ about the reference container and
+just does nothing.  In this case, as said before, the server will
+eventually release the reference container when $C$ dies.  Before
+that, it only occupies resources in the server that are associated
+with $C$.
+
+This analysis is sketchy in parts, but it covers a broad range of
+possible attacks.  For example, all possible and relevant combinations
+of task deaths and malicious tasks are covered.
Although by no means +complete, it can give us some confidence about the rightness of the +protocol. It also provides a good set of test cases that you can test +your own protocols, and improvements to the above protocol against. + + +\subsection{The trust rule} + +The protocol to copy a capability from one client to another task has +a dramatic consequence on the design of the Hurd interfaces. + +Because the receiver of the capability must make blocking calls to the +server providing the capability, the receiver of the capability +\emph{must} trust the server providing the capability. + +This means also: If the receiver of a capability does not trust the +server providing the capability, it \emph{must not} accept it. + +The consequence is that normally, servers can not accept capabilities +from clients, unless they are provided by a specific trusted server. +This can be the \texttt{task} or \texttt{auth} server for example. + +This rule is even true if the receiver does not actually want to use +the capability for anything. Just accepting the capability requires +trusting the server providing it already. + +In the Hurd on Mach, ports (which are analogous to capabilities in +this context) can be passed around freely. There is no security risk +in accepting a port from any source, because the kernel implements +them as protected objects. Using a port by sending blocking messages +to it requires trust, but simply storing the port on the server side +does not. + +This is different in the Hurd on L4: A server must not accept +capabilities unless it trusts the server providing them. Because +capabilities are used for many different purposes (remote objects, +authentication, identification), one has to be very careful in +designing the interfaces. The Hurd interfaces on Mach use ports in a +way that is not possible on L4. Such interfaces need to be +redesigned. 
+
+Often, redesigning such an interface also fixes some other security
+problems that exist in the Hurd on L4, in particular DoS
+attacks.  A good part of this paper is about redesigning the Hurd to
+avoid storing untrusted capabilities on the server side.
+
+\begin{comment}
+  Examples are:
+
+  \begin{itemize}
+  \item The new authentication protocol, which eliminates the need for
+    a rendezvous port and is not only faster, but also does not
+    require the server to block on the client anymore (see section
+    \ref{auth} on page \pageref{auth}).
+
+  \item The signal handling, which does not require the \texttt{proc}
+    server to hold the signal port for every task anymore (see section
+    \ref{signals} on page \pageref{signals}).
+
+  \item The new exec protocol, which eliminates the need to pass all
+    capabilities that need to be transferred to the new executable from
+    the old program to the filesystem server, and then to the
+    \texttt{exec} server (see section \ref{exec} on page
+    \pageref{exec}).
+
+  \item The new way to implement Unix Domain Sockets, which don't
+    require a trusted system server, so that descriptor passing (which
+    is really capability passing) can work (see section
+    \ref{unixdomainsockets} on page \pageref{unixdomainsockets}).
+
+  \item The way parent and child filesystem are linked to each other,
+    in other words: how mounting a filesystem works (see section
+    \ref{xfslookup} on page \pageref{xfslookup}).
+
+  \item The replacement for the \verb/file_reparent()/ RPC (see
+    section \ref{reparenting} on page \pageref{reparenting}).
+  \end{itemize}
+\end{comment}
+
+\section{Synchronous IPC}
+
+The Hurd only needs synchronous IPC.  Asynchronous IPC is usually not
+required.  An exception is notifications (see below).
+
+There are possibly some places in the Hurd source code where
+asynchronous IPC is assumed.  These must be replaced with different
+strategies.  One example is the implementation of select() in the GNU
+C library.
+
+\begin{comment}
+  A naive implementation would use one thread per capability to select
+  on.  A better one would combine all capabilities implemented by the
+  same server in one array and use one thread per server.
+
+  A more complex scheme might let the server process select() calls
+  asynchronously and report the result back via notifications.
+\end{comment}
+
+In other cases the Hurd receives the reply asynchronously from sending
+the message.  This works fine in Mach, because send-once rights are
+used as reply ports and Mach guarantees to deliver the reply message,
+ignoring the kernel queue limit.  In L4, no messages are queued and
+such places need to be rewritten in a different way (for example using
+extra threads).
+
+\begin{comment}
+  What happens if a client does not go into the receive phase after a
+  send, but instead does another send, and another one, quickly many
+  sends, as fast as possible?  A carelessly written server might
+  create worker threads for each request.  Instead, the server should
+  probably refuse to accept a request from a client thread that
+  already has a pending request, so the number of worker threads is
+  limited to the number of client threads.
+
+  This also makes interrupting an RPC operation easier (the client
+  thread ID can be used to identify the request to interrupt).
+\end{comment}
+
+
+\section{Notifications}
+
+Notifications to untrusted tasks happen frequently.  One case is
+object death notifications, in particular task death notifications.
+Other cases might be select() or notifications of changes to the
+filesystem.
+
+The console uses notifications to broadcast change events to the
+console content, but it also uses shared memory to broadcast the
+actual data, so not all notifications need to be received for
+functional operation.  Still, at least one notification is queued by
+Mach, and this is sufficient for the console to wake up whenever
+changes happened, even if the changes can not be processed
+immediately.
+
+From the server's point of view, notifications are simply messages with
+a send and xfer timeout of 0 and without a receive phase.
+
+For the client, however, there is only one way to ensure that it will
+receive the notification:  It must have the receiving thread in the
+receive phase of an IPC.  While this thread is processing the
+notification (even if it is only delegating it), it might be preempted
+and another (or the same) server might try to send a second
+notification.
+
+\begin{comment}
+  It is an open challenge how the client can ensure that it either
+  receives the notification or at least knows that it missed it, while
+  the server remains safe from potential DoS attacks.  The usual
+  strategy, to give receivers of notifications a higher scheduling
+  priority than the sender, is not usable in a system with untrusted
+  receivers (like the Hurd).  The best strategy determined so far is
+  to have the servers retry to send the notification several times
+  with small delays in between.  This can increase the chance that a
+  client is able to receive the notification.  However, there is still
+  the question what a server can do if the client is not ready.
+
+  An alternative might be a global trusted notification server that
+  runs at a higher scheduling priority and records which servers have
+  notifications for which clients, and that can be used by clients to
+  be notified of pending notifications.  Then the clients can poll the
+  notifications from the servers.
+\end{comment}
+
+
diff --git a/doc/posix.tex b/doc/posix.tex
new file mode 100644
index 0000000..953f1ea
--- /dev/null
+++ b/doc/posix.tex
@@ -0,0 +1,403 @@
+\chapter{The POSIX personality}
+
+The Hurd offers a POSIX API to the user by default.  This is
+implemented in the GNU C library which uses the services provided by
+the Hurd servers.  Several system servers support the C library.
+ + +\section{Process Management} +\label{proc} + +The \texttt{proc} server implements Unix process semantics in the Hurd +system. It will also assign a PID to each task that was created with +the \texttt{task} server, so that the owner of these tasks, and the +system administrator, can at least send the \verb/SIGKILL/ signal to +them. + +The \texttt{proc} server uses the task manager capability from the +\texttt{task} server to get hold of the information about all tasks +and the task control caps. + +\begin{comment} + The \texttt{proc} server might also be the natural place to + implement a first policy server for the \texttt{task} server. +\end{comment} + + +\subsection{Signals} +\label{signals} + +Each process can register the thread ID of a signal thread with the +\texttt{proc} server. The proc server will give the signal thread ID +to any other task which asks for it. + +\begin{comment} + The thread ID can be guessed, so there is no point in protecting it. +\end{comment} + +The signal thread ID can then be used by a task to contact the task to +which it wants to send a signal. The task must bootstrap its +connection with the intended receiver of the signal, according to the +protocol described in section \ref{ipcbootstrap} on page +\pageref{ipcbootstrap}. As a result, it will receive the signal +capability of the receiving task. + +The sender of a signal must then provide some capability that proves +that the sender is allowed to send the signal when a signal is posted +to the signal capability. For example, the owner of the task control +cap is usually allowed to send any signal to it. Other capabilities +might only give permission to send some types of signals. + +\begin{comment} + The receiver of the signal decides itself which signals to accept + from which other tasks. The default implementation in the C library + provides POSIX semantics, plus some extensions. +\end{comment} + +Signal handling is thus completely implemented locally in each task. 
+The \texttt{proc} server only serves as a name-server for the thread +IDs of the signal threads. + +\begin{comment} + The \texttt{proc} server can not hold the signal capability itself, + as it used to do in the implementation on Mach, as it does not trust + the tasks implementing the capability. But this is not a problem, + as the sender and receiver of a signal can negotiate and bootstrap + the connection without any further support by the \texttt{proc} + server. + + Also, the \texttt{proc} server can not even hold task info caps to + support the sender of a signal in bootstrapping the connection. + This means that there is a race between looking up the signal thread + ID from the PID in the \texttt{proc} server and acquiring a task + info cap for the task ID of the signal receiver in the sender. + However, in Unix, there is always a race when sending a signal using + \verb/kill/. The task server helps the users a bit here by not + reusing task IDs as long as possible. +\end{comment} + +Some signals are not implemented by sending a message to the task. +\verb/SIGKILL/ for example destroys the tasks without contacting it at +all. This feature is implemented in the \texttt{proc} server. + +The signal capability is also used for other things, like the message +interface (which allows you to manipulate the environment variables +and \texttt{auth} capability of a running task, etc). + + +\subsection{The \texttt{fork()} function} + +To be written. + + +\subsection{The \texttt{exec()} function} +\label{exec} + +The \texttt{exec()} operation will be done locally in a task. +Traditionally, \texttt{exec()} overlays the same task with a new +process image, because creating a new task and transferring the +associated state is expensive. In L4, only the threads and virtual +memory mappings are actually kernel state associated with a task, and +exactly those have to be destroyed by \texttt{exec()} anyway. 
There
+is a lot of Hurd specific state associated with a task (capabilities,
+for example), but it is difficult to preserve that.  There are
+security concerns, because POSIX programs do not know about Hurd
+features like capabilities, so inheriting all capabilities across
+\texttt{exec()} unconditionally seems dangerous.
+
+\begin{comment}
+  One could think that if a program is not Hurd-aware, then it will
+  not make any use of capabilities except through the normal POSIX
+  API, and thus there are no capabilities except those that the GNU C
+  library uses itself, which \texttt{exec()} can take care of.
+  However, this is only true if code that is not Hurd-aware is never
+  mixed with Hurd specific code, even libraries (unless the library
+  intimately cooperates with the GNU C library).  This would be a high
+  barrier to enable Hurd features in otherwise portable programs and
+  libraries.
+
+  It is better to make all POSIX functions safe by default and allow
+  for extensions to let the user specify which capabilities besides
+  those used for file descriptors etc to be inherited by the new
+  executable.
+
+  For \verb/posix_spawn()/, this is straight-forward.  For
+  \texttt{exec()}, it is not.  Either specific capabilities could be
+  marked as ``do not close on \texttt{exec()}'', or variants of the
+  \texttt{exec()} function could be provided which take further
+  arguments.
+\end{comment}
+
+There are also implementation obstacles hindering the reuse of the
+existing task.  Only local threads can manipulate the virtual memory
+mappings, and there is a lot of local state that has to be kept
+somewhere between the time the old program becomes defunct and the new
+binary image is installed and used (not to speak of the actual program
+snippet that runs during the transition).
+
+So the decision was made to always create a new task with
+\texttt{exec()}, and copy the desired state from the current task to
+the new task.
This is a clean solution, because a new task will
+always start out without any capabilities in servers, etc, and thus
+there is no need for the old task to try to destroy all unneeded
+capabilities and other local state before \texttt{exec()}.  Also, in
+case the exec fails, the old program can continue to run, even if the
+exec fails at a very late point (there is no ``point of no return''
+until the new task is actually up and running).
+
+For suid and sgid applications, the actual \texttt{exec()} has to be
+done by the filesystem.  However, the filesystem can not be bothered
+to also transfer all the user state into the new task.  It can not
+even do that, because it can not accept capabilities implemented by
+untrusted servers from the user.  Also, the filesystem does not want
+to rely on the new task to be cooperative, because it does not
+necessarily trust the code, if it is owned by an untrusted user.
+
+\begin{enumerate}
+\item The user creates a new task and a container with a single
+  physical page, and makes the \texttt{exec()} call to the file
+  capability, providing the task control capability.  Before that, it
+  creates a task info capability from it for its own use.
+\item The filesystem checks permission and then revokes all other
+  users on the task control capability.  This will revoke the user's
+  access to the task, and will fail if the user did not provide a
+  pristine task object.  (It is assumed that the filesystem should not
+  create the task itself so the user can not use suid/sgid
+  applications to escape from their quota restriction).
+\item Then it revokes access to the provided physical page and writes
+  a trusted startup code to it.
+\item The filesystem will also prepare all capability transactions and
+  write the required information (together with other useful
+  information) in a stack on the physical page.
+\item Then it creates a thread in the task, and starts it.  At
+  pagefault, it will provide the physical page.
+\item The startup code on the physical page completes the capability
+  transfer.  It will also install a small pager that can install file
+  mappings for this binary image.  Then it jumps to the entry point.
+\item The filesystem in the meanwhile has done all it can do to help
+  the task startup.  It will provide the content of the binary or
+  script via paging or file reads, but that happens asynchronously,
+  and as for any other task.  So the filesystem returns to the client.
+\item The client can then send its untrusted information to the new
+  task.  The new task got the client's thread ID from the filesystem
+  (possibly provided by the client), and thus knows to which thread it
+  should listen.  The new task will not trust this information
+  ultimately (ie, the new task will use the authentication, root
+  directory and other capabilities it got from the filesystem), but it
+  will accept all capabilities and make proper use of them.
+\item Then the new task will send a message to proc to take over the
+  old PID and other process state.  How this can be done best is still
+  to be determined (likely the old task will provide a process control
+  capability to the new task).  At that moment, the old task is
+  destroyed by the proc server.
+\end{enumerate}
+
+This is a coarse and incomplete description, but it shows the general
+idea.  The details will depend a lot on the actual implementation.
+
+
+\section{Unix Domain Sockets}
+\label{unixdomainsockets}
+
+In the Hurd on Mach, there was a global pflocal server that provided
+unix domain sockets and pipes to all users.  This will not work very
+well in the Hurd on L4, because for descriptor passing, read:
+capability passing, the unix domain socket server needs to accept
+capabilities in transit.  User capabilities are often implemented by
+untrusted servers, though, and thus a global pflocal server running as
+root can not accept them.
+
+However, unix domain sockets and pipes can not be implemented locally
+in the task.
An external task is needed to hold buffered data
+capabilities in transit.  In theory, a new task could be used for
+every pipe or unix domain socketpair.  However, in practice, one
+server for each user would suffice and perform better.
+
+This works, because access to Unix Domain Sockets is controlled via
+the filesystem, and access to pipes is controlled via file
+descriptors, usually by inheritance.  For example, if a fifo is
+installed as a passive translator in the filesystem, the first user
+accessing it will create a pipe in his pflocal server.  From then on,
+an active translator must be installed in the node that redirects any
+other users to the right pflocal server implementing this fifo.  This
+is asymmetrical in that the first user to access a fifo will implement
+it, and thus pay the costs for it.  But it does not seem to cause any
+particular problems in implementing the POSIX semantics.
+
+The GNU C library can contact ~/servers/socket/pflocal to implement
+socketpair, or start a pflocal server for this task's exclusive use if
+that node does not exist.
+
+All these are optimizations:  It should work to have one pflocal process
+for each socketpair.  However, performance should be better with a
+shared pflocal server, one per user.
+
+
+\section{Pipes}
+
+Pipes are implemented using \texttt{socketpair()}, that means as
+unnamed pair of Unix Domain Sockets.  The \texttt{pflocal} server will
+support this by implementing pipe semantics on the socketpair if
+requested.
+
+\begin{comment}
+  It was considered to use shared memory for the pipe implementation.
+  But we are not aware of a lock-free protocol using shared memory
+  with multiple readers and multiple writers.  It might be possible,
+  but it is not obvious if that would be faster:  Pipes are normally
+  used with \texttt{read()} and \texttt{write()}, so the data has to
+  be copied from and to the supplied buffer.  This can be done
+  efficiently in L4 even across address spaces using string items.
In + the implementation using sockets, the \texttt{pflocal} server + handles concurrent read and write accesses with mutual exclusion. +\end{comment} + + +\section{Filesystems} + +\subsection{Directory lookup across filesystems} +\label{xfslookup} + +The Hurd has the ability to let users mount filesystems and other +servers providing a filesystem-like interface. Such filesystem +servers are called translators. In the Hurd on GNU Mach, the parent +filesystem would automatically start up such translators from passive +translator settings in the inode. It would then block until the child +filesystem sends a message to its bootstrap port (provided by the +parent fs) with its root directory port. This root directory port can +then be given to any client looking up the translated node. + +There are several things wrong with this scheme, which becomes +apparent in the Hurd on L4. The parent filesystem must be careful to +not block on creating the child filesystem task. It must also be +careful to not block on receiving any acknowledgement or startup +message from it. Furthermore, it can not accept the root directory +capability from the child filesystem and forward it to clients, as +they are potentially not trusted. + +The latter problem can be solved the following way: The filesystem +knows about the server thread in the child filesystem. It also +implements an authentication capability that represents the ability to +access the child filesystem. This capability is also given to the +child filesystem at startup (or when it attaches itself to the parent +filesystem). On client dir\_lookup, the parent filesystem can return +the server\_thread and the authentication capability to the client. +The client can use that to initiate a connection with the child +filesystem (by first building up a connection, then sending the +authentication capability from the parent filesystem, and receiving a +root directory capability in exchange). + +\begin{comment} + There is a race here. 
If the child filesystem dies and the parent
+  filesystem processes the task death notification and releases the
+  task info cap for the child before the user acquires its own task
+  info cap for the child, then an imposter might be able to pretend to
+  be the child filesystem for the client.
+
+  This race can only be avoided by a more complex protocol:
+
+  Variant 1:  The user has to acquire the task info cap for the child
+  fs, and then it has to perform the lookup again.  If then the thread
+  ID is for the task it got the task ID for in advance, it can go on.
+  If not, it has to retry.  This is not so good because a directory
+  lookup is usually an expensive operation.  However, it has the
+  advantage of only slowing down the rare case.
+
+  Variant 2:  The client creates an empty reference container in the
+  task server, which can then be used by the server to fill in a
+  reference to the child's task ID.  However, the client has to create
+  and destroy such a container for every filesystem where it expects
+  it could be redirected to another (that means: for all filesystems
+  for which it does not use \verb/O_NOTRANS/).  This is quite an
+  overhead to the common case.
+
+\begin{verbatim}
+<marcus> I have another idea
+<marcus> the client does not give a container
+<marcus> server sees child fs, no container -> returns O_NOTRANS node
+<marcus> then client sees error, uses O_NOTRANS node, "" and container
+<marcus> problem solved
+<marcus> this seems to be the optimum
+<neal> hmm.
+<neal> So lazily supply a container.
+<marcus> yeah
+<neal> Hoping you won't need one.
+<marcus> and the server helps you by doing as much as it can usefully
+<neal> And that is the normal case.
+<neal> Yeah, that seems reasonable.
+<marcus> the trick is that the server won't fail completely +<marcus> it will give you at least the underlying node +\end{verbatim} +\end{comment} + +The actual creation of the child filesystem can be performed much like +a suid exec, just without any client to follow up with further +capabilities and startup info. The only problem that remains is how +the parent filesystem can know which thread in the child filesystem +implements the initial handshake protocol for the clients to use. The +only safe way here seems to be that the parent filesystem requires the +child to use the main thread for that, or that the parent filesystem +creates a second thread in the child at startup (passing its thread ID +in the startup data), requiring that this second thread is used. In +either case the parent filesystem will know the thread ID in advance +because it created the thread in the first place. This looks a bit +ugly, and violates good taste, so we might try to look for alternative +solutions. + + +\subsection{Reparenting} +\label{reparenting} + +The Hurd on Mach contains a curious RPC, \verb/file_reparent/, which +allows you to create a new capability for the same node, with the +difference that the new node will have a supplied capability as its +parent node. A directory lookup of \texttt{..} on this new capability +would return the provided parent capability. + +This function is used by the \texttt{chroot()} function, which sets +the parent node to the null capability to prevent escape from a +\texttt{chroot()} environment. It is also used by the +\texttt{firmlink} translator, which is a cross over of a symbolic and +a hard link: It works like a hard link, but can be used across +filesystems. + +A firmlink is a dangerous thing. Because the filesystem will give no +indication if the parent node it returns is provided by itself or some +other, possibly untrusted filesystem, the user might follow the parent +node to untrusted filesystems without being aware of it. 
+ +In the Hurd port to L4, the filesystem can not accept untrusted parent +capabilities on behalf of the user anymore.  The \texttt{chroot()} +function is not difficult to implement anyway, as no real capability +is required.  The server can just be instructed to create a node with +no parent node, and it can do that without problems.  Nevertheless, we +also want a secure version of the \texttt{firmlink} translator.  This +is possible if the same strategy is used as in cross filesystem +lookups.  The client registers a server thread as the handler for the +parent node, and the filesystem returns a capability that can be used +for authentication purposes.  Now, the client still needs to connect +this to the new parent node.  Normally, the filesystem providing the +new parent node will also not trust the other filesystem, and thus can +not accept the capability that should be used for authentication +purposes.  So instead of creating a direct link from the one filesystem +to the other, the firmlink translator must act as a middle man, and +redirect all accesses to the parent node first to itself, and then to +the filesystem providing the parent node.  For this, it must request a +capability from that filesystem that can be used for authentication +purposes when bootstrapping a connection, that allows such a +bootstrapping client to access the parent node directly. + +This also fixes the security issues, because now any move away from +the filesystem providing the reparented node will explicitly go first +to the \texttt{firmlink} translator, and then to the filesystem +providing the parent node.  The user can thus make an informed +decision if it trusts the \texttt{firmlink} translator and the +filesystem providing the parent node. + +\begin{comment} + This is a good example where the redesign of the IPC system forces + us to fix a security issue and provides a deeper insight into the + trust issues and how to solve them. 
+\end{comment} + + + diff --git a/doc/threads-tasks.tex b/doc/threads-tasks.tex new file mode 100644 index 0000000..07e691f --- /dev/null +++ b/doc/threads-tasks.tex @@ -0,0 +1,235 @@ +\chapter{Threads and Tasks} + +The \texttt{task} server will provide the ability to create tasks and +threads, and to destroy them. + +\begin{comment} + In L4, only threads in the privileged address space (the rootserver) + are allowed to manipulate threads and address spaces (using the + \textsc{ThreadControl} and \textsc{SpaceControl} system calls).  The + \texttt{task} server will use the system call wrappers provided by + the rootserver, see section \ref{rootserver} on page + \pageref{rootserver}. +\end{comment} + +The \texttt{task} server provides three different capability types. + +\subsubsection{Task control capabilities} +If a new task is created, it is always associated with a task control +capability.  The task control capability can be used to create and +destroy threads in the task, and destroy the task itself.  So the task +control capability gives the owner of a task control over it.  Task +control capabilities have the side effect that the task ID of this +task is not reused, as long as the task control capability is not +released.  Thus, having a task control capability affects the global +namespace of task IDs.  If a task is destroyed, task death +notifications are sent to holders of task control capabilities for +that task. + +\begin{comment} + A task is also implicitly destroyed when the last task control + capability reference is released. +\end{comment} + +\subsubsection{Task info capabilities} +\label{taskinfocap} +Any task can create task info capabilities for other tasks.  Such task +info capabilities are used mainly in the IPC system (see section +\ref{ipc} on page \pageref{ipc}).  Task info capabilities have the +side effect that the task ID of this task is not reused, as long as +the task info capability is not released. 
Thus, having a task info +capability affects the global namespace of task IDs.  If a task is +destroyed, task death notifications are sent to holders of task info +capabilities for that task. + +\begin{comment} + Because of that, holding task info capabilities must be restricted + somehow.  Several strategies can be taken: + + \begin{itemize} + \item Task death notifications can be monitored.  If there is no + acknowledgement within a certain time period, the \texttt{task} + server could be allowed to reuse the task ID anyway.  This is not + a good strategy because it can considerably weaken the security of + the system (capabilities might be leaked to tasks which reuse such + a task ID reclaimed by force). + \item The proc server can show dead task IDs which are not released + yet, in analogy to the zombie processes in Unix.  It can also make + available the list of tasks which prevent reusing the task ID, to + allow users or the system administrator to clean up manually. + \item Quotas can be used to punish users which do not acknowledge + task death timely.  For example, if the number of tasks the user + is allowed to create is restricted, the task info caps that the + user holds for dead tasks could be counted toward that limit. + \item Any task could be restricted to as many task ID references as + there are live tasks in the system, plus some slack.  That would + prevent the task from creating new task info caps if it does not + release old ones from dead tasks.  The slack would be provided to + not unnecessarily slow down a task that processes task death + notifications asynchronously to making connections with new tasks. + \end{itemize} + + In particular the last two approaches should prove to be effective + in providing an incentive for tasks to release task info caps they + do not need anymore. +\end{comment} + +\subsubsection{Task manager capability} +A task is a relatively simple object, compared to a full blown POSIX +process, for example. 
As the \texttt{task} server is enforced system +code, the Hurd does not impose POSIX process semantics in the task +server. Instead, POSIX process semantics are implemented in a +different server, the proc server (see also section \ref{proc} on page +\pageref{proc}). To allow the \texttt{proc} server to do its work, it +needs to be able to get the task control capability for any task, and +gather other statistics about them. Furthermore, there must be the +possibility to install quota mechanisms and other monitoring systems. +The \texttt{task} server provides a task manager capability, that +allows the holder of that capability to control the behaviour of the +\texttt{task} server and get access to the information and objects it +provides. + +\begin{comment} + For example, the task manager capability could be used to install a + policy capability that is used by the \texttt{task} server to make + upcalls to a policy server whenever a new task or thread is created. + The policy server could then indicate if the creation of the task or + thread is allowed by that user. For this to work, the \texttt{task} + server itself does not need to know about the concept of a user, or + the policies that the policy server implements. + + Now that I am writing this, I realize that without any further + support by the \texttt{task} server, the policy server would be + restricted to the task and thread ID of the caller (or rather the + task control capability used) to make its decision. A more + capability oriented approach would then not be possible. This + requires more thought. + + The whole task manager interface is not written yet. +\end{comment} + +When creating a new task, the \texttt{task} server allocates a new +task ID for it. The task ID will be used as the version field of the +thread ID of all threads created in the task. This allows the +recipient of a message to verify the sender's task ID efficiently and +easily. 
+ +\begin{comment} + The version field is 14 bit on 32-bit architectures, and 32 bit on + 64 bit architectures.  Because the lower six bits must not be all + zero (to make global thread IDs different from local thread IDs), + the number of available task IDs is $2^{14} - 2^6$ resp. $2^{32} - + 2^6$. + + If several systems are running in parallel on the same host, they + might share thread IDs by encoding the system ID in the upper bits + of the thread number. +\end{comment} + +Task IDs will be reused only if there are no task control or info +capabilities for that task ID held by any task in the system.  To +support bootstrapping an IPC connection (see section +\ref{ipcbootstrap} on page \pageref{ipcbootstrap}), the \texttt{task} +server will delay reusing a task ID as long as possible. + +\begin{comment} + This is similar to how PIDs are generated in Unix.  Although it is + attempted to keep PIDs small for ease of use, PIDs are not reused + immediately.  Instead, the PID is incremented up to a certain + maximum number, and only then smaller PID values are reused again. + + As task IDs are not a user interface, there is no need to keep them + small.  The whole available range can be used to delay reusing a + task ID as long as possible. +\end{comment} + +When creating a new task, the \texttt{task} server also has to create +the initial thread.  This thread will be inactive.  Once the creation +and activation of the initial thread has been requested by the user, +it will be activated.  When the user requests to destroy the last +thread in a task, the \texttt{task} server makes that thread inactive +again. + +\begin{comment} + In L4, an address space can only be implicitly created (resp. + destroyed) with the first (resp. last) thread in that address space. +\end{comment} + +Some operations, like starting and stopping threads in a task, can not +be supported by the task server, but have to be implemented locally in +each task because of the minimality of L4. 
If external control over +the threads in a task at this level is required, the debugger +interface might be used (see section \ref{debug} on page +\pageref{debug}). + + +\section{Accounting} + +We want to allow the users of the system to use the \texttt{task} +server directly, and ignore other task management facilities like the +\texttt{proc} server.  However, the system administrator still needs +to be able to identify the user who created such anonymous tasks. + +For this, a simple accounting mechanism is provided by the task +server.  An identifier can be set for a task by the task manager +capability, which is inherited at task creation time from the parent +task.  This accounting ID can not be changed without the task manager +capability. + +The \texttt{proc} server sets the accounting ID to the process ID +(PID) of the task whenever a task registers itself with the +\texttt{proc} server.  This means that all tasks which do not register +themselves with the \texttt{proc} server will be grouped together with +the first parent task that did.  This allows one to easily kill all +unregistered tasks together with their registered parent. + +The \texttt{task} server does not interpret or use the accounting ID +in any way. + + +\section{Proxy Task Server} +\label{proxytaskserver} + +The \texttt{task} server can be safely proxied, and the users of such +a proxy task server can use it like the real \texttt{task} server, +even though capabilities work a bit differently for the \texttt{task} +server than for other servers. + +The problem exists because the proxy task server would hold the real +task info capabilities for the task info capabilities that it provides +to the proxied task.  So if the proxy task server dies, all such task +info capabilities would be released, and the tasks using the proxy +task server would become insecure and open to attacks by imposters. 
+ +However, this is not really a problem, because the proxy task server +will also provide proxy objects for all task control capabilities.  So +it will be the only task which holds task control capabilities for the +tasks that use it.  When the proxy task server dies, all tasks that +were created with it will be destroyed when these task control +capabilities are released.  The proxy task server is a vital system +component for the tasks that use it, just as the real \texttt{task} +server is a vital system component for the whole system. + + +\section{Scheduling} + +The task server is the natural place to implement a simple, initial +scheduler for the Hurd.  A first version can at least collect some +information about the cpu time of a task and its threads.  Later a +proper scheduler has to be written that also has SMP support. + +The scheduler should run at a higher priority than normal threads. + +\begin{comment} + This might require that the whole task server must run at a higher + priority, which makes sense anyway. + + Not much thought has been given to the scheduler so far.  This is + work that still needs to be done. +\end{comment} + +There is no way to get at the ``system time'' in L4, it is assumed +that no time is spent in the kernel (which is mostly true).  So system +time will always be reported as $0.00$, or $0.01$. + + diff --git a/doc/vmm.tex b/doc/vmm.tex new file mode 100644 index 0000000..a41c31e --- /dev/null +++ b/doc/vmm.tex @@ -0,0 +1,26 @@ +\chapter{Virtual Memory Management} + +Traditionally, monolithic kernels, but even kernels like Mach, +provide a virtual memory management system in the kernel.  All paging +decisions are made by the kernel itself.  This requires good +heuristics.  Smart paging decisions are often not possible because the +kernel lacks the information about how the data is used. + +In the Hurd, paging will be done locally in each task.  A physical +memory server provides a number of guaranteed physical pages to tasks. 
+It will also provide a number of excess pages (over-commit).  The task +might have to return any number of excess pages on short notice.  If +the task does not comply, all mappings are revoked (essentially +killing the task). + +A problem arises when data has to be exchanged between a client and a +server, and the server wants to have control over the content of the +pages (for example, pass it on to other servers, like device drivers). +The client can not map the pages directly into the server's address +space, as it is not trusted.  Container objects created in the +physical memory server and mapped into the client and/or the server's +address space will provide the necessary security features to allow +this.  This can be used for DMA and zero-copying in the data exchange +between device drivers and (untrusted) user tasks. + + 