author    neal <neal>  2003-09-07 21:49:29 +0000
committer neal <neal>  2003-09-07 21:49:29 +0000
commit    103f3951ba2e4ae7c1c39ad9c7d71a1df23586c9 (patch)
tree      664e015e7626d3ad633b789a46c8ec5aeaab2077
parent    b90d3c97c9ef64621f42e40258af9c571624f99f (diff)
2003-09-07  Neal H. Walfield  <neal@cs.uml.edu>

	* configure.ac: Check for latex, dvips and ps2pdf.  Bail if not
	found.  Generate doc/Makefile.

	doc/: Modularize the document by breaking each chapter into its
	own tex file.  Integrate into the build system.
-rw-r--r--  ChangeLog                  5
-rw-r--r--  configure.ac              21
-rw-r--r--  doc/Makefile.am           54
-rw-r--r--  doc/authentication.tex   158
-rw-r--r--  doc/booting.tex          271
-rw-r--r--  doc/debugging.tex         10
-rw-r--r--  doc/device-drivers.tex   422
-rw-r--r--  doc/hurd-on-l4.tex      2708
-rw-r--r--  doc/introduction.tex      44
-rw-r--r--  doc/ipc.tex             1126
-rw-r--r--  doc/posix.tex            403
-rw-r--r--  doc/threads-tasks.tex    235
-rw-r--r--  doc/vmm.tex               26
13 files changed, 2785 insertions, 2698 deletions
diff --git a/ChangeLog b/ChangeLog
index fbbd212..06b55f8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2003-09-07 Neal H. Walfield <neal@cs.uml.edu>
+
+ * configure.ac: Check for latex, dvips and ps2pdf. Bail if not
+ found. Generate doc/Makefile.
+
2003-07-26 Marcus Brinkmann <marcus@gnu.org>
* Initial check-in.
diff --git a/configure.ac b/configure.ac
index 15e4228..b0051a5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -34,6 +34,20 @@ AC_PROG_CC
AM_PROG_AS
AC_PROG_RANLIB
+# Required for building the documentation
+AC_PATH_PROG([LATEX], [latex], no)
+if test "x$LATEX" = xno; then
+ missing_progs="$missing_progs latex"
+fi
+AC_PATH_PROG([DVIPS], [dvips], no)
+if test "x$DVIPS" = xno; then
+ missing_progs="$missing_progs dvips"
+fi
+AC_PATH_PROG([PS2PDF], [ps2pdf], no)
+if test "x$PS2PDF" = xno; then
+ missing_progs="$missing_progs ps2pdf"
+fi
+
# Checks for libraries.
# Checks for header files.
@@ -56,10 +70,15 @@ esac
m4_include([libhurd-slab/headers.m4])
m4_include([libhurd-ihash/headers.m4])
+if test "x$missing_progs" != "x"; then
+ AC_MSG_ERROR([The following programs were not found:$missing_progs])
+fi
+
# Checks for library functions.
AC_CONFIG_FILES([Makefile
laden/Makefile
libl4/ia32/Makefile libl4/Makefile
libhurd-slab/Makefile
- libhurd-ihash/Makefile])
+ libhurd-ihash/Makefile
+ doc/Makefile])
AC_OUTPUT
diff --git a/doc/Makefile.am b/doc/Makefile.am
new file mode 100644
index 0000000..5ee392b
--- /dev/null
+++ b/doc/Makefile.am
@@ -0,0 +1,54 @@
+# Makefile.am - Makefile template for the manual.
+# Copyright (C) 2003 Free Software Foundation, Inc.
+# Written by Neal H. Walfield
+#
+# This file is part of the GNU Hurd.
+#
+# The GNU Hurd is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# The GNU Hurd is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+
+pkgdata_DATA = hurd-on-l4.dvi hurd-on-l4.ps hurd-on-l4.pdf
+
+# _DEPENDENCIES does not work with _DATA.
+hurd-on-l4.dvi: \
+ hurd-on-l4.tex \
+ introduction.tex \
+ booting.tex \
+ ipc.tex \
+ threads-tasks.tex \
+ vmm.tex \
+ authentication.tex \
+ posix.tex \
+ debugging.tex \
+ device-drivers.tex
+
+SUFFIXES = .pdf .ps .dvi .tex
+
+.tex.dvi:
+	$(LATEX) $<
+	while egrep "^LaTeX Warning:.*Rerun to" \
+	  `echo $< | sed -e 's/.tex$$/.log/'`; do \
+	  rm -f $(subst .tex,.log,$<); \
+	  $(LATEX) $<; \
+	done
+
+.dvi.ps:
+ $(DVIPS) $<
+
+.ps.pdf:
+ $(PS2PDF) $<
+
+clean-local:
+	rm -f *.aux *.ps *.dvi *.pdf *.log *.toc
+
diff --git a/doc/authentication.tex b/doc/authentication.tex
new file mode 100644
index 0000000..817afa9
--- /dev/null
+++ b/doc/authentication.tex
@@ -0,0 +1,158 @@
+\chapter{Authentication}
+\label{auth}
+
+Capabilities are a good way to give access to protected objects and
+services. They are flexible, lightweight and generic. However, Unix
+traditionally uses access control lists (ACL) to restrict access to
+objects like files. Any task running with a certain user ID can
+access all files that are readable for the user with that user ID.
+Although all objects are implemented as capabilities in the Hurd, the
+Hurd also supports the use of user IDs for access control.
+
+The system authentication server \texttt{auth} implements the Unix
+authentication scheme using capabilities. It provides auth
+capabilities, which are associated with a list of effective and
+available user and group IDs. The holder of such a capability can use
+it to authenticate itself to other servers, using the protocol below.
+
+Of course, these other servers must use (and trust) the same
+\texttt{auth} server as the user. Otherwise, the authentication will
+fail. Once a capability is authenticated in the server, the server
+will know the user IDs of the client, and can use them to validate
+further operations.
+
+The \texttt{auth} server provides two types of capabilities:
+
+\subsubsection{Auth capabilities}
+An auth capability is associated with four vectors of IDs: The
+effective user and group IDs, which should be used by other servers to
+authenticate operations that require certain user or group IDs, and
+the available user and group IDs. Available IDs should not be used
+for authentication purposes, but can be turned into effective IDs by
+the holder of an auth capability at any time.
+
+New auth capabilities can be created from existing auth capabilities,
+but only if the requested IDs are a subset of the union of the
+(effective and available) IDs in the provided auth capabilities. If
+an auth capability has an effective or available user ID 0, then
+arbitrary new auth objects can be created from it.
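+
+A minimal sketch of the state the \texttt{auth} server might keep per
+auth object (the structure and field names are illustrative only, not
+part of any defined interface):
+
+\begin{verbatim}
+/* Illustrative only: per-object state in the auth server.  */
+struct auth_object
+{
+  unsigned int *euids, *egids;  /* Effective user and group IDs.  */
+  unsigned int *auids, *agids;  /* Available user and group IDs.  */
+  unsigned int neuids, negids, nauids, nagids;
+};
+\end{verbatim}
+
+A request for a new auth object would then only be granted if every
+requested ID occurs in the union of these vectors of the provided
+auth capabilities (or one of them contains the user ID 0).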
+
+\subsubsection{Passport capabilities}
+A passport capability can be created from an auth capability and is
+only valid for the task that created it. It can be provided to a
+server in an authentication process (see below). For the client, the
+passport capability does not directly implement any useful operation.
+For the server, it can be used to verify the identity of a user and
+read out the effective user and group IDs.
+
+The auth server should always create new passport objects for
+different tasks, even if the underlying auth object is the same, so
+that a task holding a passport capability can not spy on other tasks
+unless those tasks gave it the passport capability themselves.
+
+\section{Authenticating a client to a server}
+
+A client can authenticate itself to a server with the following
+protocol:
+
+\subsubsection{Preconditions}
+The client $C$ has an auth capability implemented by the \texttt{auth}
+server $A$. It also has a capability implemented by the server $S$.
+It wants to reauthenticate this capability with the auth capability,
+so the server associates the new user and group IDs with it.
+
+The server also has an auth capability implemented by its trusted
+\texttt{auth} server. For the reauthentication to succeed, the
+\texttt{auth} server of the client and the server must be identical.
+If this is the case, the participating tasks hold task info caps for
+all other participating tasks (because of the capabilities they hold).
+
+\begin{enumerate}
+\item The client $C$ requests a passport capability for itself from
+  the auth capability implemented by $A$.
+
+ \begin{comment}
+ Normally, the client will request the passport capability only
+ once and store it together with the auth capability.
+ \end{comment}
+
+\item The \texttt{auth} server receives the request and creates a new
+ passport capability for this auth capability and this client. The
+ passport capability is returned to the user.
+
+\item The user receives the reply from the \texttt{auth} server.
+
+ It then sends the reauthentication request to the server $S$, which
+ is invoked on the capability the client wants to reauthenticate. It
+ provides the passport capability as an argument.
+
+\item The server $S$ can accept the passport capability, if it
+ verifies that it is really implemented by the \texttt{auth} server
+  it trusts.  If the client does not provide a passport capability
+  implemented by the trusted \texttt{auth} server, the authentication
+  process is
+ aborted with an error.
+
+ Now the server can send a request to the \texttt{auth} server to
+ validate the passport capability. The RPC is invoked on the
+ passport capability.
+
+\item The \texttt{auth} server receives the validation request on the
+ passport capability and returns the task ID of the client $C$ that
+ this passport belongs to, and the effective user and group IDs for
+ the auth cap to which this passport cap belongs.
+
+ \begin{comment}
+ The Hurd on Mach returned the available IDs as well. This feature
+ is not used anywhere in the Hurd, and as the available IDs should
+ not be used for authentication anyway, this does not seem to be
+ useful. If it is needed, it can be added in an extended version
+ of the validation RPC.
+ \end{comment}
+
+\item The server receives the task ID and the effective user and group
+ IDs. The server now verifies that the task ID is the same as the
+  task ID of the sender of the reauthentication request.  Only then
+  is it certain that the reauthentication request was made by the
+  owner of the auth cap.
+ It can then return a new capability authenticated with the new user
+ and group IDs.
+
+ \begin{comment}
+ The verification of the client's task ID is necessary. As the
+ passport cap is copied to other tasks, it can not serve as a proof
+ of identity alone. It is of course absolutely crucial that the
+ server holds the task info cap for the client task $C$ for the
+ whole time of the protocol. But the same is actually true for any
+ RPC, as the server needs to be sure that the reply message is sent
+ to the sender thread (and not any imposter).
+ \end{comment}
+
+\item The client receives the reply with the new, reauthenticated
+ capability. Usually this capability is associated in the server
+ with the same abstract object, but different user credentials.
+
+ \begin{comment}
+ Of course a new capability must be created. Otherwise, all other
+ users holding the same capability would be affected as well.
+ \end{comment}
+
+ The client can now deallocate the passport cap.
+
+ \begin{comment}
+ As said before, normally the passport cap is cached by the client
+ for other reauthentications.
+ \end{comment}
+\end{enumerate}
+
+\subsubsection{Result}
+The client $C$ has a new capability that is authenticated with the new
+effective user and group IDs. The server has obtained the effective
+user and group IDs from the \texttt{auth} server it trusts.
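+
+The server side of this protocol might look roughly like the
+following sketch.  All names are illustrative; the actual RPC stubs
+and types are not yet defined.
+
+\begin{verbatim}
+/* Illustrative only: server-side reauthentication.  error_t,
+   cap_t, task_id_t and idvec_t are hypothetical types; the RPC
+   stubs are hypothetical as well.  */
+error_t
+reauthenticate (cap_t obj, task_id_t sender, cap_t passport,
+                cap_t *r_new_cap)
+{
+  task_id_t owner;
+  idvec_t euids, egids;
+  error_t err;
+
+  /* Only accept passports implemented by the trusted auth server.  */
+  if (!cap_implemented_by (passport, trusted_auth_server))
+    return EINVAL;
+
+  /* Ask the auth server whom this passport belongs to.  */
+  err = auth_validate_passport (passport, &owner, &euids, &egids);
+  if (err)
+    return err;
+
+  /* The request must come from the owner of the auth cap.  */
+  if (owner != sender)
+    return EPERM;
+
+  /* Create a new capability for the same object, associated with
+     the new user and group IDs.  */
+  return cap_dup_with_ids (obj, euids, egids, r_new_cap);
+}
+\end{verbatim}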
+
+\begin{comment}
+ The Hurd on Mach uses a different protocol, which is more complex
+ and is vulnerable to DoS attacks. The above protocol can not
+ readily be used on Mach, because the sender task of a message can
+ not be easily identified.
+\end{comment}
+
+
diff --git a/doc/booting.tex b/doc/booting.tex
new file mode 100644
index 0000000..8d39fcd
--- /dev/null
+++ b/doc/booting.tex
@@ -0,0 +1,271 @@
+\chapter{Booting}
+
+A multiboot-compliant bootloader, for example GNU GRUB, loads the
+loader program \texttt{laden}, the kernel, $\sigma_0$, the rootserver
+and further modules. The loader is started, patches the kernel
+interface page, and starts the kernel. The kernel starts $\sigma_0$
+and the rootserver. The rootserver has to deal with the other
+modules.
+
+
+\section{System bootstrap}
+
+The initial part of the boot procedure is system specific.
+
+
+\subsection{Booting the ia32}
+
+On the ia32, the BIOS will be one of the first things to run.
+Eventually, the BIOS will start the bootloader. The Hurd requires a
+multiboot-compliant bootloader, such as GNU GRUB. A typical
+configuration file entry in the \verb/menu.lst/ file of GNU GRUB will
+look like this:
+
+\begin{verbatim}
+title = The GNU Hurd on L4
+root = (hd0,0)
+kernel = /boot/laden
+module = /boot/ia32-kernel
+module = /boot/sigma0
+module = /boot/rootserver
+module = ...more servers...
+\end{verbatim}
+
+\begin{comment}
+ The name of the rootserver and the further modules are not specified
+ yet.
+\end{comment}
+
+GNU GRUB loads the binary image files into memory and jumps to the
+entry point of \texttt{laden}.
+
+
+\section{The loader \texttt{laden}}
+
+\texttt{laden} is a multiboot compliant kernel from the perspective of
+GNU GRUB. It expects at least three modules. The first module is the
+L4 kernel image, the second module is the $\sigma_0$ server image, and
+the third module is the rootserver image.
+
+\begin{comment}
+ Later, the L4 kernel will support the optional UTCB paging server
+ $\sigma_1$, which has to be treated like the other initial servers
+ by \texttt{laden}. A command line option to \texttt{laden} will
+ allow the user to specify if the third module is the rootserver or
+ $\sigma_1$. If $\sigma_1$ is used, the rootserver is the fourth
+ module in the list.
+\end{comment}
+
+\texttt{laden} copies (or moves) the three executable images to the
+right location in memory, according to their respective ELF headers.
+It also initializes the BSS section to zero.
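+
+The per-image work is standard ELF segment loading.  A sketch, with
+error handling and the overlap problem discussed below omitted:
+
+\begin{verbatim}
+/* Illustrative only: copy the loadable segments of an ELF image
+   to their link-time addresses and clear the BSS.  */
+#include <elf.h>
+#include <string.h>
+
+static void
+load_elf (void *image)
+{
+  Elf32_Ehdr *ehdr = image;
+  Elf32_Phdr *phdr = (Elf32_Phdr *) ((char *) image + ehdr->e_phoff);
+  int i;
+
+  for (i = 0; i < ehdr->e_phnum; i++)
+    if (phdr[i].p_type == PT_LOAD)
+      {
+        char *dst = (char *) phdr[i].p_paddr;
+        memcpy (dst, (char *) image + phdr[i].p_offset,
+                phdr[i].p_filesz);
+        /* The BSS is the part of the segment that has no file
+           data; zero it.  */
+        memset (dst + phdr[i].p_filesz, 0,
+                phdr[i].p_memsz - phdr[i].p_filesz);
+      }
+}
+\end{verbatim}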
+
+\begin{comment}
+ Laden has to deal with overlapping source and destination memory
+ areas in an intelligent way. It currently will detect such
+ situations, but is not always able to find a solution, even if one
+ exists.
+
+  If a memory area stretches out to the very last page addressable
+  in 32 bits, the high address of the memory descriptor will overflow.
+ This is in fact the behaviour of \texttt{kickstart}. \texttt{laden}
+ currently truncates such an area by one page. This needs
+ clarification in the L4 standard.
+\end{comment}
+
+Then it searches for the kernel interface page (KIP) in the L4 kernel
+image and modifies it in the following way:
+
+\begin{itemize}
+\item The memory descriptors are filled in according to the memory
+ layout of the system. On ia32, this information is -- at least
+ partially -- provided by GNU GRUB.
+
+ \begin{comment}
+ GNU GRUB seems to omit information about the memory that is shared
+ with the VGA card. \texttt{laden} creates a special entry for
+ that region, overriding any previous memory descriptor.
+ \end{comment}
+
+\item The start and end addresses and the entry point of the initial
+ servers are filled in.
+
+ \begin{comment}
+ A future version of L4 should support adding information about the
+ UTCB area of the initial rootserver as well. Until then, the
+ rootserver has no clean way to create a new thread (a hack is used
+ by the rootserver to calculate the UTCB addresses for other
+ threads).
+ \end{comment}
+
+\item The \verb/boot_info/ field is initialized (a sketch of this
+  step follows the list).
+
+ \begin{comment}
+ The \verb/boot_info/ field is currently set to the GNU GRUB
+ \verb/multiboot_info/ structure. This only works for the ia32
+ architecture of course. We might want to have a more architecture
+ independent way to pass the information about further modules to
+ the rootserver. We also might want to gather the information
+ provided by GNU GRUB in a single page (if it is not).
+ \end{comment}
+\end{itemize}
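+
+As an illustration of the last step, the \verb/boot_info/ field is a
+single word in the KIP that \texttt{laden} simply overwrites.  A
+sketch, where the offset of the field is a stand-in value:
+
+\begin{verbatim}
+/* Illustrative only: patch the boot_info field of the KIP.  KIP
+   points at the kernel interface page found in the loaded kernel
+   image; MBI points at GNU GRUB's multiboot_info structure.  The
+   offset below is hypothetical; the real one is fixed by the L4
+   ABI.  */
+#define KIP_BOOT_INFO 0xb8
+
+static void
+patch_boot_info (char *kip, void *mbi)
+{
+  *(unsigned int *) (kip + KIP_BOOT_INFO) = (unsigned int) mbi;
+}
+\end{verbatim}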
+
+
+\section{The L4 kernel}
+
+The L4 kernel initializes itself and then creates the address spaces
+and threads for the initial servers $\sigma_0$ and the rootserver. It
+maps all physical memory idempotently into $\sigma_0$, and sets the
+pager of the rootserver thread to $\sigma_0$. Then it starts the
+initial servers.
+
+
+\section{The initial server $\sigma_0$}
+
+$\sigma_0$ acts as the pager for the rootserver, answering page fault
+messages by mapping the page at the fault address idempotently in the
+rootserver.
+
+\begin{comment}
+ $\sigma_0$ can also be used directly by sending messages to it,
+ according to the $\sigma_0$ RPC protocol. This is used by the kernel
+ to allocate reserved memory, but can also be used by the user to
+  explicitly allocate more memory than the single pages received
+  indirectly via page faults.
+\end{comment}
+
+The thread ID of $\sigma_0$ is (\verb/UserBase/, 1).
+
+\begin{comment}
+ We will write all thread IDs in the form (\verb/thread nr/,
+ \verb/version/).
+\end{comment}
+
+Any fpage will only be provided to one thread. $\sigma_0$ will return
+an error if another thread attempts to map or manipulate an fpage that
+has already been given to some other thread, even if both threads
+reside in the same address space.
+
+
+\section{The initial server $\sigma_1$}
+
+$\sigma_1$ is intended to provide a paging service for UTCB memory.
+This will allow orthogonal persistence to be implemented. It is not
+yet supported.
+
+The thread ID of $\sigma_1$ is (\verb/UserBase + 1/, 1).
+
+
+\section{The rootserver}
+\label{rootserver}
+
+The rootserver is the only task in the system whose threads can
+perform privileged system calls.  So the rootserver must provide
+wrappers for the system calls to the other, unprivileged system tasks.
+
+\begin{comment}
+ For this, a simple authentication scheme is required. The
+ rootserver can keep a small, statically allocated table of threads
+ which are granted access to the system call wrappers. The caller
+  could provide the index into the table for fast O(1) lookup instead
+  of a linear search.  Threads with access could be allowed to add other
+ threads or change existing table entries. The same scheme can be
+ used in the device driver framework.
+
+ The rootserver should have one thread per CPU, and run at a high
+ priority.
+\end{comment}
+
+The rootserver has the following initial state:
+
+\begin{itemize}
+\item Its thread ID is (\verb/UserBase + 2/, 1).
+
+\item The priority is set to 255, the maximum value.
+
+ \begin{comment}
+ The rootserver, or at least the system call wrapper, should run at
+ a very high priority.
+ \end{comment}
+
+\item The instruction pointer \verb/%eip/ is set to the entry point;
+  all other registers are undefined (including the stack pointer).
+
+\item The pager is set to $\sigma_0$.
+
+\item The exception handler is set to \verb/nilthread/.
+
+\item The scheduler is set to the rootserver thread itself.
+\end{itemize}
+
+So the first thing the rootserver has to do is to set up a simple
+stack.
+
+Then the rootserver should evaluate the \verb/boot_info/ field in the
+KIP to find the information about the other modules. It should parse
+the information and create the desired initial tasks of the operating
+system.  The Hurd uses a boot script syntax to allow passing
+information about the other initial tasks and the root tasks to each
+initial task in a generalized manner.
+
+\begin{comment}
+ The exact number and type of initial tasks necessary to boot the
+ Hurd are not yet known. Chances are that this list includes the
+ \texttt{task} server, the physical memory server, the device
+ servers, and the boot filesystem. The boot filesystem might be a
+ small simple filesystem, which also includes the device drivers
+ needed to access the real root filesystem.
+\end{comment}
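+
+On ia32, where \verb/boot_info/ currently holds GNU GRUB's
+\verb/multiboot_info/ structure, walking the module list might look
+as follows.  The structure layout is as defined by the Multiboot
+specification; \texttt{start\_initial\_task} is a hypothetical helper.
+
+\begin{verbatim}
+/* Illustrative only: walk the GNU GRUB module list reachable
+   from boot_info on ia32.  Layout as in the Multiboot spec.  */
+struct multiboot_info
+{
+  unsigned int flags;
+  unsigned int mem_lower, mem_upper;
+  unsigned int boot_device;
+  unsigned int cmdline;
+  unsigned int mods_count;
+  unsigned int mods_addr;
+  /* Further fields omitted.  */
+};
+
+struct module
+{
+  unsigned int mod_start;  /* Start and end of the module ...  */
+  unsigned int mod_end;    /* ... in physical memory.  */
+  unsigned int string;     /* Command line of the module.  */
+  unsigned int reserved;
+};
+
+static void
+process_modules (struct multiboot_info *mbi)
+{
+  struct module *mod = (struct module *) mbi->mods_addr;
+  unsigned int i;
+
+  for (i = 0; i < mbi->mods_count; i++)
+    /* start_initial_task is a hypothetical helper.  */
+    start_initial_task ((void *) mod[i].mod_start,
+                        mod[i].mod_end - mod[i].mod_start,
+                        (char *) mod[i].string);
+}
+\end{verbatim}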
+
+
+\section{The physical memory server}
+
+To be written.
+
+\begin{comment}
+ In fact, I already have some ideas. Here they are:
+
+ The rootserver copies (or moves) the physical memory server
+ executable image to the right location in memory, according to its
+ respective ELF header. It also initializes the BSS section to zero.
+
+  Then it follows the \texttt{exec()} protocol to start up the new
+ task. This should be done as transparently as possible. All pages
+ the rootserver provides because of page faults should be granted.
+ The rootserver waits for the physical memory server to contact the
+ rootserver thread. Then the following startup protocol is walked
+ through:
+
+ \begin{enumerate}
+ \item The physical memory server requests all system memory from the
+ rootserver. The rootserver maps the memory from $\sigma_0$ and
+ grants it to the physical memory server. Alternatively, the
+ physical memory server might get the memory directly from
+ $\sigma_0$, but it should ask the rootserver for the amount and
+ location of memory to get.
+
+ \item For each module that has not been used yet, the rootserver
+ requests a capability in the physical memory server that can be
+ used to map in pages from the range of memory that the module
+ occupies. These capabilities should implement the same pager
+ interface that mappable files implement.
+
+ The idea is that these capabilities can be used in the
+ \texttt{exec()} protocol to start up the tasks for these modules.
+ If a module is not a task, the capability can be used to access
+ the module data by mapping it into the address space like a file.
+ The physical memory server can even swap out pages that back these
+ objects on memory pressure.
+
+ So, the physical memory server is in fact a simple filesystem for
+ these initial tasks, usable only for mapping operations.
+
+ \item The rootserver can then start up the other tasks in the module
+ list using the normal \texttt{exec()} protocol.
+ \end{enumerate}
+
+ The result is that all tasks except for the rootserver can be
+ started like normal Hurd tasks, and can also be swapped out.
+\end{comment}
+
+
diff --git a/doc/debugging.tex b/doc/debugging.tex
new file mode 100644
index 0000000..23f8230
--- /dev/null
+++ b/doc/debugging.tex
@@ -0,0 +1,10 @@
+\chapter{Debugging}
+\label{debug}
+
+L4 does not support debugging. So every task has to implement a debug
+interface and implement debugging locally. gdb needs to be changed to
+make use of this interface.  How to perform the required
+authentication, how the debug thread is advertised to gdb, and what
+the debug interface should look like are all open questions.
+
+
diff --git a/doc/device-drivers.tex b/doc/device-drivers.tex
new file mode 100644
index 0000000..b7e3215
--- /dev/null
+++ b/doc/device-drivers.tex
@@ -0,0 +1,422 @@
+\chapter{Device Drivers}
+
+This chapter was written by Peter De Schrijver and Daniel Wagner.
+
+\section{Requirements}
+
+ \begin{itemize}
+ \item Performance: Speed is important!
+ \item Portability: The framework should work on different architectures.
+
+   Also: usable in a non-Hurdish environment with only
+   small changes.
+
+ \item Flexibility
+ \item Convenient interfaces
+ \item Consistency
+ \item Safety: a driver failure should have as little impact on the
+   system as possible.
+ \end{itemize}
+
+\section{Overview}
+
+ The framework consists of:
+ \begin{itemize}
+ \item Bus drivers
+ \item Device drivers
+ \item Service servers (plugin managers, $\omega_0$, rootserver)
+ \end{itemize}
+
+\subsection{Drivers and the filesystem}
+
+ The device driver framework will only offer a physical device view,
+ i.e. a tree with devices as the leaves, connected by various bus
+ technologies.  Any logical view and naming persistence will have to
+ be built on top of this (by a translator).
+
+\subsection{Layer of the drivers}
+
+ The device driver framework consists only of the lower level drivers
+ and doesn't need to have a complicated scheme for access control.
+ This is because it should be possible to share devices, e.g. with a
+ neighbour Hurd.  The authentication is done by installing a virtual
+ driver in each OS/neighbour Hurd.  The driver framework trusts these
+ virtual drivers.  So it's possible for a non-Hurdish system to use
+ the driver framework just by implementing these virtual drivers.
+
+ Only threads which have registered as trusted are allowed to access
+ device drivers.  The check is simply done by comparing the sender's
+ ID against a table of known threads, as in the sketch below.
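+
+ The table layout and the stand-in thread ID type are illustrative
+ only:
+
+\begin{verbatim}
+/* Illustrative only: reject senders that have not registered as
+   trusted.  thread_id_t stands in for the real L4 thread ID type.  */
+typedef unsigned int thread_id_t;
+
+#define MAX_TRUSTED 64
+static thread_id_t trusted_table[MAX_TRUSTED];
+
+static int
+sender_is_trusted (thread_id_t sender)
+{
+  unsigned int i;
+
+  for (i = 0; i < MAX_TRUSTED; i++)
+    if (trusted_table[i] == sender)
+      return 1;
+  return 0;
+}
+\end{verbatim}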
+
+\subsection{Address spaces}
+
+ Drivers always reside in their own address space (AS).  The overhead
+ for cross-AS IPC is small enough to permit this.
+
+\subsection{Zero copying and DMA}
+
+ It is assumed that there are no differences between physical memory
+ pages.  For example, each physical memory page can be used for DMA
+ transfers.  Of course, older hardware like ISA devices thus can not
+ be supported.  Who cares?
+
+ With this assumption, the device driver framework can be given any
+ physical memory page for DMA operation. This physical memory page
+ must be pinned down.
+
+ If an application wants to send or receive data to/from a device
+ driver, it has to tell the virtual driver the page on which the
+ operation has to be executed.  Since the application doesn't know
+ the virtual-to-real memory mapping, it has to ask the physical memory
+ manager for the real memory address of the page in question.  If the
+ page is not directly mapped from the physical memory manager, the
+ application asks the mapper (another application which has mapped
+ this memory region to the first application) to resolve the mapping.
+ This can be done recursively.  Normally, this resolving of mappings
+ can be sped up using a cache service, since a small number of
+ pages are reused very often.
+
+ With this scheme, the drivers do not have to take special care of
+ zero copying if there is only one virtual driver.  When there is
+ more than one virtual driver, pages have to be copied for all other
+ virtual drivers.
+
+\subsection{Root bus driver}
+
+ The root bus is the entry point for looking up devices.
+
+ XXX There should be iterators/visitors for operating on
+ busses/devices. (daniel)
+
+\subsection{Physical versus logical device view}
+
+ The device driver framework will only offer a physical device view,
+ i.e. a tree with devices as the leaves, connected by various bus
+ technologies.  Any logical view and naming persistence will have to
+ be built on top of this (by a translator).
+
+\subsection{Things for the future}
+
+ \begin{itemize}
+ \item Interaction with the task server (e.g. listing driver threads
+   with ps, etc.)
+ \item Power management
+ \end{itemize}
+
+\section{Bus Drivers}
+
+A bus driver is responsible for managing the bus and providing access to
+devices connected to it. In practice it means a bus driver has to
+perform the following tasks:
+
+\begin{itemize}
+\item Handle hotplug events
+
+  Busses which do not support hotplugging will be treated as if there
+  were one insertion event for every device connected to them when the
+  bus driver is started.  Drivers which don't support autoprobing of
+  devices will probably have to read some configuration data from a
+  file; or, if the driver is needed for bootstrapping, the
+  configuration can be given as an argument on its stack.  In some cases the bus
+ doesn't generate insertion/removal events, but can still support
+ some form of hotplug functionality if the user tells the driver when
+ a change to the bus configuration has happened (eg. SCSI).
+
+\item Configure client device drivers
+
+ The bus driver should start the appropriate client device driver
+ translator when an insertion event is detected. It should also
+ provide the client device driver with all necessary configuration
+ info, so it can access the device it needs. This configuration data
+ typically consists of the bus addresses of the device and possibly
+  IRQ numbers or DMA channel IDs.  The device driver is loaded by the
+  associated plugin manager.
+
+\item Provide access to devices
+
+ This means the bus driver should be able to perform a bus
+ transaction on behalf of a client device driver. In some cases this
+ involves sending a message and waiting for reply (eg. SCSI, USB,
+ IEEE 1394, Fibre Channel,...). The driver should provide
+ send/receive message primitives in this case. In other cases
+  devices on the bus can be accessed by doing memory accesses or by
+ using special I/O instructions. In this case the driver should
+ provide mapping and unmapping primitives so a client device driver
+ can get access to the memory range or is allowed to access the I/O
+ addresses. The client device driver should use a library, which is
+ bus dependant, to access the device on the bus. This library hides
+ the platform specific details of accessing the bus.
+
+  Furthermore, the bus driver must also support rescans for hardware.
+  It might be that not all drivers are found during bootstrapping, so
+  drivers could be loaded later on.  This is done by sending new attach
+  notifications to the bus's plugin manager.  The plugin manager then
+  loads a new driver, if possible.  A probe function is not
+  needed, since all supported hardware can be identified by
+  vendor/device identifications (unlike ISA hardware).  For hardware
+  busses which don't support such identification (ISA), only static
+  configuration is possible (configuration scripts etc.).
+\end{itemize}
+
+
+\subsection{Plugin Manager}
+
+ Each bus driver has a handle/reference to which insert/remove events
+ are sent.  The owner of the handle/reference must then take
+ appropriate action, like loading the drivers.  These actors are
+ called plugin managers.  A sketch of such a notification interface
+ follows.
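+
+ The names below are illustrative only:
+
+\begin{verbatim}
+/* Illustrative only: how a bus driver reports hotplug events to
+   the plugin manager owning its notification handle.  */
+enum bus_event { EVENT_ATTACH, EVENT_DETACH };
+
+struct bus_notify
+{
+  /* Called once per insertion or removal event; device_id names
+     the device on this bus.  */
+  void (*notify) (enum bus_event event, const char *device_id);
+};
+\end{verbatim}
+
+ A plugin manager's \texttt{notify} implementation would react to an
+ attach event by locating and loading a matching driver, as described
+ above.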
+
+\subsection{Generic Bus Driver}
+
+ Operations:
+ \begin{itemize}
+ \item notify (attach, detach)
+ \item string enumerate
+ \end{itemize}
+
+ XXX Extract generic bus services from the PCI Bus Driver section
+ which could also be used by other PCI-related busses (ISA).
+ The name for this service is misleading, since a SCSI Bus Driver
+ does not have anything in common with a PCI bus. (daniel)
+
+\subsection{ISA Bus Driver}
+Inherits from:
+
+\begin{itemize}
+\item Generic Bus Driver
+\end{itemize}
+
+Operations:
+\begin{itemize}
+\item (none)
+\end{itemize}
+
+XXX The interface has not been defined up to now. (daniel)
+
+
+\subsection{PCI Bus Driver}
+
+Inherits from:
+\begin{itemize}
+\item Generic Bus Driver
+\end{itemize}
+
+Operations:
+\begin{itemize}
+\item map\_mmio: map a PCI BAR for MMIO
+\item map\_io: map a PCI BAR for I/O
+\item map\_mem: map a PCI BAR for memory
+\item read\_mmio\_{8,16,32,64}: read from a MMIO register
+\item write\_mmio\_{8,16,32,64}: write to a MMIO register
+\item read\_io\_{8,16,32,64}: read from an IO register
+\item write\_io\_{8,16,32,64}: write to an IO register
+\item read\_config\_{8,16,32,?}: read from a PCI config register
+\item write\_config\_{8,16,32,?}: write to a PCI config register
+\item alloc\_dma\_mem(for non zero copying): allocate main memory useable for DMA
+\item free\_dma\_mem (for non zero copying): free main memory useable for DMA
+\item prepare\_dma\_read: write back CPU cachelines for DMAable memory area
+\item sync\_dma\_write: discard CPU cachelines for DMAable memory area
+\item alloc\_consistent\_mem: allocate memory which is consistent between CPU
+ and device
+\item free\_consistent\_mem: free memory which
+ is consistent between CPU and device
+\item get\_irq\_mapping (A,B,C,D): get the IRQ matching the INT(A,B,C,D) line
+\end{itemize}
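+
+In C, a subset of this interface might look as follows.  This is a
+sketch only; the exact types and signatures are not yet fixed.
+
+\begin{verbatim}
+/* Illustrative only: a subset of the PCI bus driver interface.  */
+typedef unsigned int error_t;             /* Stand-in error type.  */
+typedef struct pci_device *pci_device_t;  /* Stand-in device handle.  */
+
+/* Map BAR number BAR of device DEV for MMIO and return the
+   resulting virtual address in *ADDR.  */
+error_t map_mmio (pci_device_t dev, int bar, void **addr);
+
+/* Read the 32 bit PCI configuration register at offset OFF.  */
+error_t read_config_32 (pci_device_t dev, int off, unsigned int *value);
+
+/* Allocate LEN bytes of main memory usable for DMA (for non zero
+   copying) and return its address in *ADDR.  */
+error_t alloc_dma_mem (unsigned int len, void **addr);
+\end{verbatim}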
+
+\section{Device Drivers}
+\subsection{Classes}
+\begin{itemize}
+\item character: This is the standard tty as known in the Unix environment.
+\item block
+\item human input: Keyboard, mouse, ...
+\item packet switched network
+\item circuit switched network
+\item framebuffer
+\item streaming audio
+\item streaming video
+\item solid state storage: flash memory
+\end{itemize}
+
+\subsection{Human input devices (HID) and the console}
+
+The HIDs and the console are critical for user interaction with the
+system.  Furthermore, the console should be working as soon as possible
+to give feedback.  Log messages which are sent to the console before
+the hardware has been initialized should be buffered.
+
+\subsection{Generic Device Driver}
+Operations:
+\begin{itemize}
+\item init : prepare hardware for use
+\item start : start normal operation
+\item stop : stop normal operation
+\item deinit : shutdown hardware
+\item change\_irq\_peer : change the peer thread to propagate IRQ messages to.
+\end{itemize}
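+
+Expressed as a C operations table (illustrative only; the types are
+stand-ins):
+
+\begin{verbatim}
+/* Illustrative only: the generic device driver operations as a
+   function table.  */
+typedef unsigned int error_t;      /* Stand-in error type.  */
+typedef unsigned int thread_id_t;  /* Stand-in L4 thread ID type.  */
+
+struct device_driver_ops
+{
+  error_t (*init) (void);    /* Prepare hardware for use.  */
+  error_t (*start) (void);   /* Start normal operation.  */
+  error_t (*stop) (void);    /* Stop normal operation.  */
+  error_t (*deinit) (void);  /* Shutdown hardware.  */
+  /* Change the peer thread to propagate IRQ messages to.  */
+  error_t (*change_irq_peer) (thread_id_t peer);
+};
+\end{verbatim}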
+
+
+\subsection{ISA Devices}
+Inherits from:
+\begin{itemize}
+\item Generic Device Driver
+\end{itemize}
+
+Supported devices
+\begin{itemize}
+\item Keyboard (ps2)
+\item serial port (mainly for debugging purposes)
+\item parallel port
+\end{itemize}
+
+XXX interface definition for each device driver is missing. (daniel)
+
+
+\subsection{PCI Devices}
+Inherits from:
+\begin{itemize}
+\item Generic Device Driver
+\end{itemize}
+
+Supported devices:
+\begin{itemize}
+\item block devices
+\item ...
+\end{itemize}
+
+XXX interface definition for each device driver is missing. (daniel)
+
+
+\section{Resource Management}
+
+
+\subsection{IRQ handling}
+
+\subsubsection{IRQ based interrupt vectors}
+
+Some CPU architectures (e.g. 68k, IA32) can directly jump to an
+interrupt vector depending on the IRQ number.  This is typically the
+case on CISC CPUs.  In this case there is some prioritization scheme.  On
+IA32 for example, the lowest IRQ number has the highest priority.
+Sometimes the priorities are programmable.  Most RISC CPUs have only
+a few interrupt vectors which are connected to external IRQs (typically
+1 or 2).  This means the IRQ handler should read a register in the
+interrupt controller to determine which IRQ handler has to be
+executed. Sometimes the hardware assists here by providing a register
+which indicates the highest priority interrupt according to some
+(programmable) scheme.
+
+\subsubsection{IRQ acknowledgement}
+
+The IRQ acknowledgement is done in two steps. First inform the
+hardware about the successful IRQ acceptance. Then inform the ISRs
+about the IRQ event.
+
+\subsubsection{Edge versus level triggered IRQs}
+
+Edge triggered IRQs typically don't need explicit acknowledgment by
+the CPU at the device level. You can just acknowledge them at the
+interrupt controller level.  Level triggered IRQs typically need to be
+explicitly acknowledged by the CPU at the device level.  The CPU has to
+read or write a register from the IRQ generating peripheral to make
+the IRQ go away.  If this is not done, the IRQ handler will be
+reentered immediately after it ended, effectively creating an endless
+loop. Another way of preventing this would be to mask the IRQ.
+
+\subsubsection{Multiple interrupt controllers}
+
+Some systems have multiple interrupt controllers in cascade. This is
+for example the case on a PC, where you have 2 8259 interrupt
+controllers. The second controller is connected to the IRQ 2 pin of
+the first controller.  It is also common in non-PC systems which still
+use some standard PC components such as a Super IO controller. In this
+case the 2 8259's are connected to 1 pin of the primary interrupt
+controller.  What is important for the software here is that you need
+to acknowledge IRQs at each controller.  So to acknowledge an IRQ from
+the second 8259 connected to the first 8259 connected to another
+interrupt controller, you have to give an ACK command to each of those
+controllers.  Another important fact is that on the PC architecture
+the order of the ACKs is important.
+
+\subsubsection{Shared IRQs}
+
+Some systems have shared IRQs. In this case the IRQ handler has to
+look at all devices using the same IRQ...
+
+\subsubsection{IRQ priorities}
+
+All IRQs on L4 have priorities, so if an IRQ occurs, any IRQ lower than
+the first IRQ will be blocked until the first IRQ has been
+acknowledged.  ISR priorities must match the hardware priorities (danger
+of priority inversion).  Furthermore, the IRQ acknowledgement order is
+important.
+
+The 8259 also supports a specific IRQ acknowledge, IIRC.  But this
+scheme does not work in most level triggered IRQ environments. In
+these environments you must acknowledge (or mask) the IRQ before
+leaving the IRQ handler, otherwise the CPU will immediately reenter
+the IRQ handler, effectively creating an endless loop. In this case L4
+would have to mask the IRQ. The IRQ thread would have to unmask it
+after acknowledgement and processing.
+
+\subsubsection{IRQ handling by L4/x86}
+
+The L4 kernel does handle IRQ acknowledgement.
+
+
+\subsection{$\omega_0$}
+
+$\omega_0$ is a system-central IRQ-logic server.  It runs in the
+privileged address space in order to be allowed to reroute IRQ IPCs.
+
+If an IRQ is shared between several devices, the drivers are daisy
+chained and have to notify their peers if an IRQ IPC has arrived.
+
+XXX For more detail see XXX URL missing
+
+Operations:
+\begin{itemize}
+\item attach\_irq : attach an ISR thread to the IRQ
+\item detach\_irq : detach an ISR thread from the IRQ
+\end{itemize}
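+
+A driver's ISR thread might use this interface roughly as in the
+following skeleton.  All helpers besides \texttt{attach\_irq} are
+hypothetical, and the IPC details are omitted.
+
+\begin{verbatim}
+/* Illustrative only: skeleton of an ISR thread using omega0.  */
+static void
+isr_thread (int irq)
+{
+  attach_irq (irq);
+  for (;;)
+    {
+      wait_for_irq_ipc ();       /* Block until the IRQ fires.  */
+      handle_device_irq ();      /* Acknowledge at the device level.  */
+      notify_peer_if_shared ();  /* Daisy chaining, see above.  */
+    }
+}
+\end{verbatim}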
+
+
+\subsection{Memory}
+If no physical memory pages are provided by the OS, the device driver
+framework allocates pages from the physical memory manager.  At no
+point in time does the device driver framework have to handle any
+virtual-to-physical page mapping.
+
+
+\section{Bootstrapping}
+
+A simpleFS provides initial drivers for bootstrapping.  The root bus
+driver and the simpleFS are loaded by GNU GRUB as modules.  The root
+bus driver then signals for loading new (bus) drivers.  As before, if
+there is for some reason no driver available for a device, the bus
+driver doesn't change the device state and waits for a notification
+that new drivers are available.  This simpleFS might be based on the
+BSD libstand (a library for standalone applications).  The simpleFS
+doesn't need to be writable either.
+
+
+\subsection{Plugin Manager}
+A plugin manager handles driver loading for devices.  It searches for
+drivers in search paths (on filesystems).  It's possible to add new
+search paths later.  This allows the system to bootstrap with only
+one search path (the simpleFS). When the search path is changed, the
+device tree will be scanned for devices which don't have a driver
+loaded yet. If a driver has become available, it will be loaded.
+
+
+\section{Order of implementation}
+
+\begin{enumerate}
+\item rootserver, plugin server
+\item root bus server
+\item pci bus
+\item isa bus
+\item serial port (isa bus)
+\item console
+\end{enumerate}
+
+
diff --git a/doc/hurd-on-l4.tex b/doc/hurd-on-l4.tex
index b48f0e9..8d8ffaa 100644
--- a/doc/hurd-on-l4.tex
+++ b/doc/hurd-on-l4.tex
@@ -1,4 +1,4 @@
-\documentclass[9pt,a4paper]{extarticle}
+\documentclass{book}
%\usepackage{german}
%\usepackage[margin=2.5cm]{geometry}
@@ -9,2706 +9,20 @@
\date{August 2003}
\begin{document}
+
\maketitle
-\newpage
\tableofcontents
-\newpage
\setlength{\parindent}{0pt}
\setlength{\parskip}{1ex plus 0.5ex minus 0.2ex}
-\section{Introduction}
-
-The GNU Hurd is a multi-server operating system running on top of a
-microkernel (currently Mach variants). The core motivation of the
-Hurd is the following:
-
-\begin{quote}
- \emph{The operating system should enable its users to share the
- resources of the system without harming each other.}
-\end{quote}
-
-The focus is on the user: the system should try to allow the user to
-do anything that is not harmful to other users.  Many operating
-systems restrict what the user can do in order to be more secure,
-while others allow the user to do everything but fail to protect the
-users from each other effectively.
-
-The Hurd is designed to minimize the system code that the user is
-required to use, while allowing the user to use, ignore or replace the
-remaining system code, all without harming other users.
-
-So while the L4 microkernel tries to minimize the policy that the
-kernel enforces on the software running on it, the Hurd tries to
-minimize the policy that the operating system enforces on its users.
-Furthermore, the Hurd also aims to provide a POSIX compatible general
-purpose operating system. However, this POSIX personality of the Hurd
-is provided for convenience only, and to make the Hurd useful. Other
-personalities can be implemented and used by the users of the system
-along with the POSIX personality. This default personality of the
-Hurd also provides some convenient features that allow the user to
-extend the system so that all POSIX compatible programs can take
-advantage of it.
-
-These notes are a moving target in the effort to find the best
-strategy to port the Hurd to the L4 microkernel.
-
-\begin{comment}
- Remarks about the history of a certain feature and implementation
- details are set in a smaller font and separated from the main text,
- just like this paragraph. Because this is work in progress, there
- are naturally a lot of such comments.
-\end{comment}
-
-
-\section{Booting}
-
-A multiboot-compliant bootloader, for example GNU GRUB, loads the
-loader program \texttt{laden}, the kernel, $\sigma_0$, the rootserver
-and further modules. The loader is started, patches the kernel
-interface page, and starts the kernel. The kernel starts $\sigma_0$
-and the rootserver. The rootserver has to deal with the other
-modules.
-
-
-\subsection{System bootstrap}
-
-The initial part of the boot procedure is system specific.
-
-
-\subsubsection{Booting the ia32}
-
-On the ia32, the BIOS will be one of the first things to run.
-Eventually, the BIOS will start the bootloader. The Hurd requires a
-multiboot-compliant bootloader, such as GNU GRUB. A typical
-configuration file entry in the \verb/menu.lst/ file of GNU GRUB will
-look like this:
-
-\begin{verbatim}
-title = The GNU Hurd on L4
-root = (hd0,0)
-kernel = /boot/laden
-module = /boot/ia32-kernel
-module = /boot/sigma0
-module = /boot/rootserver
-module = ...more servers...
-\end{verbatim}
-
-\begin{comment}
- The name of the rootserver and the further modules are not specified
- yet.
-\end{comment}
-
-GNU GRUB loads the binary image files into memory and jumps to the
-entry point of \texttt{laden}.
-
-
-\subsection{The loader \texttt{laden}}
-
-\texttt{laden} is a multiboot compliant kernel from the perspective of
-GNU GRUB. It expects at least three modules. The first module is the
-L4 kernel image, the second module is the $\sigma_0$ server image, and
-the third module is the rootserver image.
-
-\begin{comment}
- Later, the L4 kernel will support the optional UTCB paging server
- $\sigma_1$, which has to be treated like the other initial servers
- by \texttt{laden}. A command line option to \texttt{laden} will
- allow the user to specify if the third module is the rootserver or
- $\sigma_1$. If $\sigma_1$ is used, the rootserver is the fourth
- module in the list.
-\end{comment}
-
-\texttt{laden} copies (or moves) the three executable images to the
-right location in memory, according to their respective ELF headers.
-It also initializes the BSS section to zero.
-
-\begin{comment}
- Laden has to deal with overlapping source and destination memory
- areas in an intelligent way. It currently will detect such
- situations, but is not always able to find a solution, even if one
- exists.
-
- If a memory area stretches out to the very last page addressable
- in 32 bits, the high address of the memory descriptor will overflow.
- This is in fact the behaviour of \texttt{kickstart}. \texttt{laden}
- currently truncates such an area by one page. This needs
- clarification in the L4 standard.
-\end{comment}
-
-Then it searches for the kernel interface page (KIP) in the L4 kernel
-image and modifies it in the following way:
-
-\begin{itemize}
-\item The memory descriptors are filled in according to the memory
- layout of the system. On ia32, this information is -- at least
- partially -- provided by GNU GRUB.
-
- \begin{comment}
- GNU GRUB seems to omit information about the memory that is shared
- with the VGA card. \texttt{laden} creates a special entry for
- that region, overriding any previous memory descriptor.
- \end{comment}
-
-\item The start and end addresses and the entry point of the initial
- servers are filled in.
-
- \begin{comment}
- A future version of L4 should support adding information about the
- UTCB area of the initial rootserver as well. Until then, the
- rootserver has no clean way to create a new thread (a hack is used
- by the rootserver to calculate the UTCB addresses for other
- threads).
- \end{comment}
-
-\item The \verb/boot_info/ field is initialized.
-
- \begin{comment}
- The \verb/boot_info/ field is currently set to the GNU GRUB
- \verb/multiboot_info/ structure. This only works for the ia32
- architecture of course. We might want to have a more architecture
- independent way to pass the information about further modules to
- the rootserver. We also might want to gather the information
- provided by GNU GRUB in a single page (if it is not).
- \end{comment}
-\end{itemize}
-
-
-\subsection{The L4 kernel}
-
-The L4 kernel initializes itself and then creates the address spaces
-and threads for the initial servers $\sigma_0$ and the rootserver. It
-maps all physical memory idempotently into $\sigma_0$, and sets the
-pager of the rootserver thread to $\sigma_0$. Then it starts the
-initial servers.
-
-
-\subsection{The initial server $\sigma_0$}
-
-$\sigma_0$ acts as the pager for the rootserver, answering page fault
-messages by mapping the page at the fault address idempotently in the
-rootserver.
-
-\begin{comment}
- $\sigma_0$ can also be used directly by sending messages to it,
- according to the $\sigma_0$ RPC protocol. This is used by the kernel
- to allocate reserved memory, but can also be used by the user to
- explicitly allocate more memory than the single pages received
- indirectly via page faults.
-\end{comment}
-
-The thread ID of $\sigma_0$ is (\verb/UserBase/, 1).
-
-\begin{comment}
- We will write all thread IDs in the form (\verb/thread nr/,
- \verb/version/).
-\end{comment}
-
-Any fpage will only be provided to one thread. $\sigma_0$ will return
-an error if another thread attempts to map or manipulate an fpage that
-has already been given to some other thread, even if both threads
-reside in the same address space.
-
-
-\subsection{The initial server $\sigma_1$}
-
-$\sigma_1$ is intended to provide a paging service for UTCB memory.
-This will allow orthogonal persistence to be implemented. It is not
-yet supported.
-
-The thread ID of $\sigma_1$ is (\verb/UserBase + 1/, 1).
-
-
-\subsection{The rootserver}
-\label{rootserver}
-
-The rootserver is the only task in the system whose threads can
-perform privileged system calls.  So the rootserver must provide
-wrappers for the system calls to the other, unprivileged system tasks.
-
-\begin{comment}
- For this, a simple authentication scheme is required. The
- rootserver can keep a small, statically allocated table of threads
- which are granted access to the system call wrappers. The caller
- could provide the index into the table for fast O(1) lookup instead
- of a linear search.  Threads with access could be allowed to add other
- threads or change existing table entries. The same scheme can be
- used in the device driver framework.
-
- The rootserver should have one thread per CPU, and run at a high
- priority.
-\end{comment}
-
-The rootserver has the following initial state:
-
-\begin{itemize}
-\item Its thread ID is (\verb/UserBase + 2/, 1).
-
-\item The priority is set to 255, the maximum value.
-
- \begin{comment}
- The rootserver, or at least the system call wrapper, should run at
- a very high priority.
- \end{comment}
-
-\item The instruction pointer \verb/%eip/ is set to the entry point;
-  all other registers are undefined (including the stack pointer).
-
-\item The pager is set to $\sigma_0$.
-
-\item The exception handler is set to \verb/nilthread/.
-
-\item The scheduler is set to the rootserver thread itself.
-\end{itemize}
-
-So the first thing the rootserver has to do is to set up a simple
-stack.
-
-Then the rootserver should evaluate the \verb/boot_info/ field in the
-KIP to find the information about the other modules. It should parse
-the information and create the desired initial tasks of the operating
-system.  The Hurd uses a boot script syntax to allow passing
-information about the other initial tasks and the root tasks to each
-initial task in a generalized manner.
-
-\begin{comment}
- The exact number and type of initial tasks necessary to boot the
- Hurd are not yet known. Chances are that this list includes the
- \texttt{task} server, the physical memory server, the device
- servers, and the boot filesystem. The boot filesystem might be a
- small simple filesystem, which also includes the device drivers
- needed to access the real root filesystem.
-\end{comment}
-
-
-\subsection{The physical memory server}
-
-To be written.
-
-\begin{comment}
- In fact, I already have some ideas. Here they are:
-
- The rootserver copies (or moves) the physical memory server
- executable image to the right location in memory, according to its
- respective ELF header. It also initializes the BSS section to zero.
-
- Then it follows the \texttt{exec()} protocol to start up the new
- task. This should be done as transparently as possible. All pages
- the rootserver provides because of page faults should be granted.
- The rootserver waits for the physical memory server to contact the
- rootserver thread. Then the following startup protocol is walked
- through:
-
- \begin{enumerate}
- \item The physical memory server requests all system memory from the
- rootserver. The rootserver maps the memory from $\sigma_0$ and
- grants it to the physical memory server. Alternatively, the
- physical memory server might get the memory directly from
- $\sigma_0$, but it should ask the rootserver for the amount and
- location of memory to get.
-
- \item For each module that has not been used yet, the rootserver
- requests a capability in the physical memory server that can be
- used to map in pages from the range of memory that the module
- occupies. These capabilities should implement the same pager
- interface that mappable files implement.
-
- The idea is that these capabilities can be used in the
- \texttt{exec()} protocol to start up the tasks for these modules.
- If a module is not a task, the capability can be used to access
- the module data by mapping it into the address space like a file.
- The physical memory server can even swap out pages that back these
- objects on memory pressure.
-
- So, the physical memory server is in fact a simple filesystem for
- these initial tasks, usable only for mapping operations.
-
- \item The rootserver can then start up the other tasks in the module
- list using the normal \texttt{exec()} protocol.
- \end{enumerate}
-
- The result is that all tasks except for the rootserver can be
- started like normal Hurd tasks, and can also be swapped out.
-\end{comment}
-
-
-\section{Inter-process communication (IPC)}
-\label{ipc}
-
-The Hurd requires a capability system.  Capabilities are used to prove
-your identity to other servers (authentication), and to access
-objects implemented by servers, like devices, files, directories,
-terminals, and other things. The server can use a capability for
-whatever it wants. Capabilities provide interfaces. Interfaces can
-be invoked by sending messages to the capability. In L4, this means
-that a message is sent to a thread in the server providing the
-capability, with the identifier for the capability in the message.
-
-Capabilities are protected objects. Access to a capability needs to
-be granted by the server. Once you have a capability, you can copy it
-to other tasks (if the server permits it, which is usually the case).
-In the Hurd, access to capabilities is always granted to a whole task,
-not to individual threads.
-
-\begin{comment}
- There is no reason for the server not to permit it, because the
- holder of the capability could also just act as a proxy for the
- intended receiver instead of copying the capability to it.  The
- operation might fail anyway, for example because of resource
- shortage, in particular if the server puts a quota on the number of
- capabilities a user can hold.
-\end{comment}
-
-Capabilities provide two essential services to the Hurd. They are
-used to restrict access to a server function, and they are the
-standard interface the components in the Hurd use to communicate with
-each others. Thus, it is important that their implementation is fast
-and secure.
-
-\begin{comment}
- There are several ways to implement such a capability system. A
- more traditional design would be a global, trusted capability server
- that provides capabilities to all its users. The L4 redirector
- could be used to reroute all client traffic automatically through
- this server. This approach has several disadvantages:
-
- \begin{itemize}
- \item It adds a lot of overhead to every single RPC, because all
- traffic has to be routed through the capability server, which must
- then perform the authentication on the server's behalf.
- \item It would be difficult to copy a capability to another task.
- Either the cap server would have to provide interfaces for clients
- to do it, or it would have to know the message format for every
- interface and do it automatically.
- \item It would be a single point of failure. If it had a bug and
- crashed, the whole system would be affected.
-\item Users could not avoid it; it would be enforced system code.
- \item It is inflexible. It would be hard to replace or extend at
- run-time.
- \end{itemize}
-
- Another approach is taken by CORBA with IORs. IORs contain long
- random numbers which allow the server to identify a user of an
- object. This approach is not feasible for the following reasons:
-
- \begin{itemize}
- \item Even good random numbers can be guessed. Long enough random
- numbers can reduce the likelihood to an arbitrarily small value,
- though (below the probability of a hardware failure).
- \item Good random numbers are in short supply, and are slow to
- generate.  Good pseudo-random numbers are faster, but still difficult
- to generate.  The random number generator would become a critical
- part of the operating system.
- \item The random number would have to be transferred in every single
- message.  Because it would have to be long, it would have a
- significant negative impact on IPC performance.
- \end{itemize}
-\end{comment}
-
-The Hurd implements the capability system locally in each task. A
-common default implementation will be shared by all programs.
-However, a malicious untrusted program can do nothing to disturb the
-communication of other tasks. A capability is identified in the
-client task by the server thread and a local identifier (which can be
-different from client to client). The server thread will receive
-messages for the capabilities. The first argument in the message is
-the capability identifier. Although every task can get different IDs
-for the same capability, a well-behaving server will give the same ID
-to a client which already has a capability and gets the same
-capability from another client. So clients can compare capability IDs
-from the server numerically to check if two capabilities are the same,
-but only if one of the two IDs is received while the client already
-had the other one.
-
-Because access to a capability must be restricted, the server needs to
-be careful in only allowing registered and known users to access the
-capability. For this, the server must be sure that it can determine
-the sender of a message. In L4, this is easy on the surface: The
-kernel provides the receiving thread with the sender's thread ID,
-which also contains the task ID in the version field. However, the
-server must also know for sure if this task is the same task that it
-gave access to the capability. Comparing the task IDs numerically is
-not good enough; the server must also somehow have knowledge of or
-influence on how task IDs are reused when tasks die and are created.
-
-The same is true for the client, of course, which trusts the server
-and thus must be sure that it is not tricked into trusting
-unreliable data from an imposter, or into sending sensitive data to it.
-
-\begin{comment}
- The \texttt{task} server wants to reuse thread numbers because that
- makes best use of kernel memory. Reusing task IDs, the version
- field of a thread ID, is not so important, but there are only 14
- bits for the version field (and the lower six bits must not be all
- zero). So a thread ID is bound to be reused eventually.
-
- Using the version field in a thread ID as a generation number is not
- good enough, because it is so small. Even on 64-bit architectures,
- where it is 32 bit long, it can eventually overflow.
-\end{comment}
-
-The best way to prevent a task from being tricked into talking to an
-imposter is to have the \texttt{task} server notify the task if the
-communication partner dies. The \texttt{task} server must guarantee
-that the task ID is not reused until all tasks that got such a
-notification acknowledge that it is processed, and thus no danger of
-confusion exists anymore.
-
-The \texttt{task} server provides references to task IDs in form of
-\emph{task info capabilities}. If a task has a task info capability
-for another task, it prevents that this other task's task ID is reused
-even if that task dies, and it also makes sure that task death
-notifications are delivered in that case.
-
-\begin{comment}
- Because only the \texttt{task} server can create and destroy tasks,
- and assign task IDs, there is no need to hold such task info
- capabilities for the \texttt{task} server, nor does the
- \texttt{task} server need to hold task info capabilities for its
- clients. This avoids the obvious bootstrap problem in providing
- capabilities in the \texttt{task} server. This will even work if
- the \texttt{task} server is not the real \texttt{task} server, but a
- proxy task server (see section \ref{proxytaskserver} on page
- \pageref{proxytaskserver}).
-\end{comment}
-
-As task IDs are a global resource, care has to be taken that this
-approach does not allow for a DoS attack by exhausting the task ID
-number space; see section \ref{taskinfocap} on page
-\pageref{taskinfocap} for more details.
-
-
-\subsection{Capabilities}
-
-This subsection contains implementation details about capabilities.
-
-A server will usually operate on objects, and not capabilities. In
-the case of a filesystem, this could be file objects, for example.
-
-\begin{comment}
- In the Hurd, filesystem servers have to keep different objects for
- each time a file is looked up (or ``opened''), because some state,
- for example authentication, open flags and record locks, is
- associated not with the file directly, but with this instance of
- opening the file. Such a state structure (``credential'') will also
- contain a pointer and reference to the actual file node. For
- simplicity, we will assume that the capability is associated with a
- file node directly.
-\end{comment}
-
-To provide another task with access to the object, the server creates
-a capability and associates it with the object (by setting a hook
-variable in the capability). From this capability, the server can
-create send references either for itself or for other tasks. If the
-server creates send references for itself, it can use the capability
-just as it can use capabilities implemented by other servers. This
-makes access to locally and remotely implemented capabilities
-identical: code written to work on capabilities can be used for
-remote objects as well as for local ones.
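-
-To make this concrete, a server-side capability might be represented
-roughly as in the following minimal C sketch. All type and field
-names here are illustrative assumptions, not part of any defined
-interface.
-
-\begin{verbatim}
-/* Sketch of a server-side capability object.  The hook variable
-   associates the capability with the server object it protects.  */
-struct hurd_cap
-{
-  void *hook;          /* The object (e.g. a file node) that this
-                          capability gives access to.  */
-  unsigned int srefs;  /* Number of send references handed out.  */
-};
-\end{verbatim}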
-
-If the server creates a send reference for another task (a client), a
-new capability ID will be created for this task. This ID will only be
-valid for this task, and should be returned to the client.
-
-The client itself will create a capability object from this capability
-ID. The capability will also contain information about the server,
-for example the server thread which should be used for sending
-messages to the capability.
-
-If the client wants to send a message, it will send it to the provided
-server thread, and use the capability ID it got from the server as the
-first argument in the RPC. The server receives the message, and now
-has to look up the capability ID in the list of capabilities for this
-task.
-
-\begin{comment}
- The server knows the task ID from the version field of the sender's
- thread ID. It can look up the list of capabilities for this task in
- a hash table. The capability ID can be an index into an array, so
- the server only needs to perform a range check. This makes it
- possible to verify quickly that the user is allowed to access the
- object.
-
- This is not enough if several systems run in parallel on the same
- host. Then the version field for the threads in the other systems will
- not be under the control of the Hurd's \texttt{task} server, and can
- thus not be trusted. The server can still use the version field to
- find out the task ID, which will be correct \emph{if the thread is
- part of the same subsystem}. It also has to verify that the
- thread belongs to this subsystem. Hopefully the subsystem will be
- encoded in the thread ID. Otherwise, the \texttt{task} server has
- to be consulted (and, assuming that thread numbers are not shared by
- the different systems, the result can be cached).
-\end{comment}
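-
-A hedged sketch of this lookup in C: the per-task capability table is
-found in a hash table keyed by the sender's task ID, and the
-capability ID is validated with a simple range check. All names are
-illustrative; \verb/hash_find/ stands in for whatever hash table
-lookup the server uses.
-
-\begin{verbatim}
-struct cap_table
-{
-  unsigned int size;       /* Number of slots in CAPS.  */
-  struct hurd_cap **caps;  /* Slot N holds the cap with ID N.  */
-};
-
-/* Look up capability CAP_ID for the sender task TASK_ID.
-   Return NULL if the sender holds no such capability.  */
-struct hurd_cap *
-cap_lookup (unsigned int task_id, unsigned int cap_id)
-{
-  struct cap_table *ct = hash_find (cap_tables, task_id);
-
-  /* The capability ID is an index into the table, so a range
-     check suffices to validate it.  */
-  if (ct == NULL || cap_id >= ct->size)
-    return NULL;
-
-  return ct->caps[cap_id];
-}
-\end{verbatim}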
-
-The server reads out the capability associated with the capability ID,
-and invokes the server stub according to the message ID field in the
-message.
-
-After the message is processed, the server sends its reply to the
-sender thread with a zero timeout.
-
-\begin{comment}
- Servers must never block on sending messages to clients. Even a
- small timeout can be used for DoS attacks. The client can always
- make sure that it receives the reply by using a combined send and
- receive operation together with an infinite timeout.
-\end{comment}
-
-The above scheme assumes that the server and the client already have
-task info caps for the respective other task. This is the normal
-case, because acquiring these task info caps is part of the protocol
-that is used when a capability is copied from one task to another.
-
-
-\subsubsection{Bootstrapping a client-server connection}
-\label{ipcbootstrap}
-
-If the client and the server do not know about each other yet, then
-they can bootstrap a connection without support from any other task
-except the \texttt{task} server. The purpose of the initial handshake
-is to give both participants a chance to acquire a task info cap for
-the other participant's task ID, so they can be sure that from then
-on they will always talk to the same task they talked to before.
-
-\paragraph{Preconditions}
-The client knows the thread ID of the server thread that receives and
-processes the bootstrap messages. Some other task might hold a task
-info capability to the server the client wants to connect to.
-
-\begin{comment}
- If no such other task exists, the protocol will still work.
- However, the client might not get a connection to the server that
- ran at the time the client started the protocol, but rather to the
- server that ran at the time the client acquired the task info cap
- for the server's task ID (after step 1 below).
-
- This is similar to how sending signals works in Unix: Technically,
- at the time you type \texttt{kill 203} and press enter, you do not
- know whether the process with PID 203 you thought of will receive
- the signal, or some other process that got that PID in the time
- between you obtaining the PID and issuing the
- \texttt{kill} command.
-\end{comment}
-
-FIXME: Here should be the pseudo code for the protocol. For now, you
-have to take it out of the long version.
-
-\begin{enumerate}
-
-\item The client acquires a task info capability for the server's task
- ID, either directly from the \texttt{task} server, or from another
- task in a capability copy. From that point on, the client can be
- sure to always talk to the same task when talking to the server.
-
- Of course, if the client already has a task info cap for the server
- it does not need to do anything in this step.
-
-\begin{comment}
- As explained above, if the client does not already have another
- task holding the task info cap, it has no secure information about
- which task it is that it got the task info cap for.
-\end{comment}
-
-\item The client sends a message to the server, requesting the initial
- handshake.
-
-\item The server receives the message, and acquires a task info cap
- for the client task (directly from the \texttt{task} server).
-
- Of course, if the server already has a task info cap for the client
- it does not need to do anything in this step.
-
-\begin{comment}
- At this point, the server knows that future messages from this task
- will come from the same task as it got the task info cap for.
- However, it does not know that this is the same task that sent the
- initial handshake request in step 2 above. This shows that there is
- no sense in verifying the task ID or performing any other
- authentication before acquiring the task info cap.
-\end{comment}
-
-\item The server replies to the initial handshake request with an
- empty reply message.
-
-\begin{comment}
- Because the reply now can go to a different task than the request
- came from, sending the reply might fail. It might also succeed and
- be accepted by the task that replaced the requestor. Or it might
- succeed normally. The important thing is that it does not matter to
- the server at all. It would have provided the same ``service'' to
- the ``imposter'' of the client, if he had bothered to do the
- request. As no authentication is done yet, there is no point for
- the server to bother.
-
- This means, however, that the server needs to be careful not to
- consume too many resources for this service. Fortunately, this is
- easy to achieve. Only one task info cap per client task will ever
- be held in the server. The server can either keep it around until
- the task dies (and a task death notification is received), or it can
- clean it up after some timeout if the client does not follow up and
- do some real authentication.
-\end{comment}
-
-\item The client receives the reply message to its initial handshake
- request.
-
-\item The client sends a request to create its initial capability.
- How this request looks depends on the type of the server and the
- initial capabilities it provides. Here are some examples:
-
- \begin{itemize}
- \item A filesystem might provide an unauthenticated root directory
- object in return for the underlying node capability, which is
- provided by the parent filesystem and proves to the filesystem
- that the user was allowed to look up the root node of this
- filesystem (see section \ref{xfslookup} on page
- \pageref{xfslookup}).
-
- \begin{comment}
- In this example, the parent filesystem will either provide the
- task info cap for the child filesystem to the user, or it will
- hold the task info cap while the user is creating their own
- (which the user has to verify by repeating the lookup, though).
- Again, see section \ref{xfslookup} on page \pageref{xfslookup}.
-
- The unauthenticated root directory object will then have to be
- authenticated using the normal reauthentication mechanism (see
- section \ref{auth} on page \pageref{auth}). This can also be combined
- in a single RPC.
- \end{comment}
-
- \item Every process acts as a server that implements the signal
- capability for this process. Tasks that want to send a signal to
- another task can perform the above handshake, and then provide
- some type of authentication capability that indicates that they
- are allowed to send a signal. Different authentication
- capabilities can be accepted by the signalled task for different
- types of signals.
-
- \begin{comment}
- The Hurd used to store the signal capability in the proc server,
- where authorized tasks could look it up. This is no longer
- possible because a server can not accept capabilities
- implemented by untrusted tasks; see below.
- \end{comment}
- \end{itemize}
-
-\item The server replies with whatever capability the client
- requested, provided that the client has presented the necessary
- authentication capabilities, if any.
-
- \begin{comment}
- It is not required that the server performs any authentication at
- all, but it is recommended, and all Hurd servers will do so.
-
- In particular, the server should normally only allow access from
- tasks running in the same system, if running multiple systems on
- the same host is possible.
- \end{comment}
-\end{enumerate}
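-
-As a rough illustration of the steps above, the client side of the
-handshake might look like the following C sketch. All stub and
-helper names (\verb/task_info_create/, \verb/task_id_of_thread/,
-\verb/server_handshake/, \verb/server_get_initial_cap/) are invented
-for this example; they do not denote a defined interface.
-
-\begin{verbatim}
-/* Sketch of the client side of the bootstrap handshake.  */
-error_t
-connect_to_server (l4_thread_id_t server, unsigned int *initial_cap)
-{
-  /* Step 1: Hold a task info cap so the server's task ID can
-     not be reused behind our back.  */
-  error_t err = task_info_create (task_id_of_thread (server));
-  if (err)
-    return err;
-
-  /* Steps 2 and 5: Request the handshake; the (empty) reply
-     tells us the server now holds a task info cap for us.  */
-  err = server_handshake (server);
-  if (err)
-    return err;
-
-  /* Steps 6 and 7: Request the initial capability, providing
-     whatever authentication this type of server requires.  */
-  return server_get_initial_cap (server, initial_cap);
-}
-\end{verbatim}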
-
-\paragraph{Result}
-The client has a task info capability for the server and an
-authenticated capability. The server has a task info capability for
-the client and has seen some sort of authentication for the
-capability it gave to the client.
-
-\begin{comment}
- If you think that the above protocol is complex, you have seen
- nothing yet! Read on.
-\end{comment}
-
-
-\subsubsection{Returning a capability from a server to a client}
-
-Before we go on to the more complex case of copying a capability from
-one client to another, let us point out that once a client has a
-capability from a server, it is easy for the server to return more
-capabilities it implements to the client.
-
-The server just needs to create the capability, acquire a capability
-ID in the client's cap ID space, and return the information in the
-reply RPC.
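-
-In a C sketch, with \verb/cap_create/ and \verb/cap_id_alloc/ as
-assumed helpers (they are not a defined interface), this amounts to:
-
-\begin{verbatim}
-/* Sketch: create a new capability for OBJ and hand it to the
-   client task TASK in the reply.  */
-error_t
-return_new_cap (unsigned int task, void *obj, unsigned int *cap_id)
-{
-  struct hurd_cap *cap = cap_create (obj);
-  if (cap == NULL)
-    return ENOMEM;
-
-  /* Allocate an ID in the client's capability ID space; the ID
-     is returned to the client in the reply RPC.  */
-  *cap_id = cap_id_alloc (task, cap);
-  return 0;
-}
-\end{verbatim}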
-
-FIXME: Here should be the pseudo code for the protocol. For now, you
-have to take it out of the long version.
-
-\begin{comment}
- The main point of this section is to point out that only one task
- info capability is required to protect all capabilities provided to
- a single task. The protocols described here always assume that no
- task info caps are held by anyone (except those mentioned in the
- preconditions). In reality, sometimes the required task info caps
- will already be held.
-\end{comment}
-
-
-\subsubsection{Copying a capability from one client to another task}
-
-The most complex operation in managing capabilities is to copy or move
-a capability from the client to another task, which subsequently
-becomes a client of the server providing the capability. The
-difficulty here lies in the fact that the protocol should be fast, but
-also robust and secure. If any of the participants dies unexpectedly,
-or any of the untrusted participants is malicious, the others should
-not be harmed.
-
-\paragraph{Preconditions}
-The client $C$ has a capability from server $S$ (this implies that $C$
-has a task info cap for $S$ and $S$ has a task info cap for $C$). It
-wants to copy the capability to the destination task $D$. For this,
-it will have to make RPCs to $D$, so $C$ also has a capability from
-$D$ (this implies that $C$ has a task info cap for $D$ and $D$ has a
-task info cap for $C$). Of course, the client $C$ trusts its servers
-$S$ and $D$. $D$ might trust $S$ or not, and thus accept or reject
-the capability that $C$ wants to give to $D$. $S$ does not trust
-either $C$ or $D$.
-
-The \texttt{task} server is also involved, because it provides the
-task info capabilities. Everyone trusts the \texttt{task} server they
-use. This does not need to be the same one for every participant.
-
-FIXME: Here should be the pseudo code for the protocol. For now, you
-have to take it out of the long version.
-
-\begin{enumerate}
-\item The client invokes the \verb/cap_ref_cont_create/ RPC on the
- capability, providing the task ID of the intended receiver $D$ of
- the capability.
-
-\item The server receives the \verb/cap_ref_cont_create/ RPC from the
- client. It requests a task info cap for $D$ from its trusted task
- server, under the constraint that $C$ is still living.
-
- \begin{comment}
- A task can provide a constraint when creating a task info cap in
- the \texttt{task} server. The constraint is a task ID. The task
- server will only create the task info cap and return it if the
- task with the constraint task ID is not destroyed. This allows a
- task requesting a task info capability to make sure that
- another task, which also holds this task info cap, is not
- destroyed. This is important, because if a task is destroyed, all
- the task info caps it held are released.
-
- In this case, the server relies on the client to hold a task info
- cap for $D$ until it has established its own. See below for what
- could go wrong if the server did not provide a constraint and both
- the client and the destination task died unexpectedly.
- \end{comment}
-
- Now that the server has established its own task info cap for $D$,
- it creates a reference container for $D$ with the following
- properties:
-
- \begin{itemize}
- \item The reference container has a single new reference for the
- capability.
-
- \item The reference container has an ID that is unique among all
- reference container IDs for the client $C$.
-
- \item The reference container is associated with the client $C$. If
- $C$ dies, and the server processes the task death notification for
- it, the server will destroy the reference container and release
- the capability reference it has (if any). All resources
- associated with the reference container will be released. If this
- reference container was the only reason for $S$ to hold the task
- info cap for $D$, the server will also release the task info cap
- for $D$.
-
- \item The reference container is also associated with the
- destination task $D$. If $D$ dies, and the server processes the
- task death notification for it, the server will release the
- capability reference that is in the reference container (if any).
- It will not destroy the part of the container that is associated
- with $C$.
- \end{itemize}
-
- The server returns the reference container ID $R$ to the client.
-
-\item The client receives the reference container ID $R$.
-
- \begin{comment}
- If several capabilities have to be copied in one message, the
- above steps need to be repeated for each capability. With
- appropriate interfaces, capabilities could be collected so that
- only one call per server has to be made. We are assuming here
- that only one capability is copied.
- \end{comment}
-
-\item The client sends the server thread ID $T$ and the reference
- container ID $R$ to the destination task $D$.
-
-\item The destination task $D$ receives the server thread ID $T$ and
- the reference container ID $R$ from $C$.
-
- It now inspects the server thread ID $T$, and in particular the task
- ID component of it. $D$ has to decide whether or not it trusts
- this task to be a server for it.
-
- If $D$ trusts $C$, it might decide to always trust $T$, too,
- regardless of which task contains $T$.
-
- If $D$ does not trust $C$, it might be more picky about the task
- that contains $T$. This is because $D$ will have to become a client
- of $T$, so it must trust it. For example, it will block on messages
- it sends to $T$.
-
- \begin{comment}
- If $D$ is a server, it will usually only accept capabilities from
- its client that are provided by specific other servers it trusts.
- This can be the authentication server, for example (see section
- \ref{auth} on page \pageref{auth}).
-
- Usually, the type of capability that $D$ wants to accept from $C$
- is then further restricted, and only one possible trusted server
- implements that type of capabilities. Thus, $D$ can simply
- compare the task ID of $T$ with the task ID of its trusted server
- (authentication server, ...) to make the decision if it wants to
- accept the capability or not.
- \end{comment}
-
- If $D$ does not trust $T$, it replies to $C$ (probably with an error
- value indicating why the capability was not accepted). In that
- case, jump to step \ref{copycapout}.
-
- Otherwise, it requests a task info cap for $S$ from its trusted task
- server, under the constraint that $C$ is still living.
-
- Then $D$ sends a \verb/cap_ref_cont_accept/ RPC to the server $S$,
- providing the task ID of the client $C$ and the reference container
- ID $R$.
-
-\begin{comment}
- \verb/cap_ref_cont_accept/ is one of the few interfaces that is not
- sent to a (real) capability, of course. Nevertheless, it is part of
- the capability object interface, hence the name. You can think of
- it as a static member in the capability class that does not require
- an instance of the class.
-\end{comment}
-
-\item The server receives the \verb/cap_ref_cont_accept/ RPC from the
- destination task $D$. It verifies that a reference container exists
- with the ID $R$, that is associated with $D$ and $C$.
-
- \begin{comment}
- The server will store the reference container in data structures
- associated with $C$, under an ID that is unique but local to $C$.
- So $D$ needs to provide both pieces of information: the task ID of
- $C$ and the reference container ID.
- \end{comment}
-
- If that is the case, it takes the reference from the reference
- container, and creates a capability ID for $D$ from it. The
- capability ID for $D$ is returned in the reply message.
-
- From that moment on, the reference container is deassociated from
- $D$. It is still associated with $C$, but it does not contain any
- reference for the capability.
-
- \begin{comment}
- It is not deassociated from $C$ and removed completely, so that
- its ID $R$ (or at least the part of it that is used for $C$) is
- not reused. $C$ must explicitly destroy the reference container
- anyway, because $D$ might die unexpectedly or return an error that
- gives no indication whether it accepted the reference or not.
- \end{comment}
-
-\item The destination task $D$ receives the capability ID and enters
- it into its capability system. It sends a reply message to $C$.
-
- \begin{comment}
- If the only purpose of the RPC was to copy the capability, the
- reply message can be empty. Usually, capabilities will be
- transferred as part of a larger operation, though, and more work
- will be done by $D$ before returning to $C$.
- \end{comment}
-
-\item \label{copycapout} The client $C$ receives the reply from $D$.
- Regardless of whether it indicated failure or success, it will now
- send the \verb/cap_ref_cont_destroy/ message to the server $S$,
- providing the reference container $R$.
-
- \begin{comment}
- This message can be a simple message. It does not require a reply
- from the server.
- \end{comment}
-
-\item The server receives the \verb/cap_ref_cont_destroy/ message and
- removes the reference container $R$. The reference container is
- deassociated from $C$ and $D$. If this was the only reason that $S$
- held a task info cap for $D$, this task info cap is also released.
-
- \begin{comment}
- Because the reference container can not be deassociated from $C$
- by any other means than this interface, the client does not need
- to provide $D$. $R$ can not be reused without the client $C$
- having it destroyed first. This is different from the
- \verb/cap_ref_cont_accept/ call made by $D$ (see above).
- \end{comment}
-
-\end{enumerate}
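-
-To make the bookkeeping described above concrete, a reference
-container in the server $S$ might be represented as in the following
-minimal C sketch. All names are illustrative assumptions.
-
-\begin{verbatim}
-/* Sketch of a reference container held by the server S.  */
-struct ref_container
-{
-  unsigned int id;       /* R: unique among C's containers.  */
-  unsigned int client;   /* C: the container is destroyed when
-                            C's task death is processed.  */
-  unsigned int dest;     /* D: only the reference is released
-                            when D's task death is processed;
-                            the container stays with C.  */
-  struct hurd_cap *cap;  /* The single new reference held for
-                            the capability, or NULL once D has
-                            accepted it.  */
-};
-\end{verbatim}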
-
-\paragraph{Result}
-For the client $C$, nothing has changed. Either the destination task
-$D$ did not accept the capability, and nothing has changed for it or
-for the server $S$; or $D$ accepted the capability, and it now has a
-task info cap for $S$ and a reference to the capability provided by
-$S$. In this case, the server $S$ has a task info cap for $D$ and
-provides a capability ID for this task.
-
-The above protocol is for copying a capability from $C$ to $D$. If
-the goal was to move the capability, then $C$ can now release its
-reference to it.
-
-\begin{comment}
- Originally we considered moving capabilities by default, and
- requiring the client to acquire an additional reference if it wanted
- to copy the capability instead. However, it turned out that for the
- implementation, copying is easier to handle. One reason is that the
- client usually will use local reference counting for the
- capabilities it holds, and with local reference counting, one
- server-side reference is shared by many local references. In that
- case, you would need to acquire a new server-side reference even if
- you want to move the capability. The other reason is cancellation.
- If an RPC is cancelled, and you want to back out of it, you need to
- restore the original situation. And that is easier if you do not
- change the original situation in the first place until the natural
- ``point of no return''.
-\end{comment}
-
-The above protocol quite obviously achieves the result described in
-the concluding paragraph above. However, many other, often simpler,
-protocols would do so as well. The other protocols we looked at are
-not secure or robust, though, or require more operations. To date, we
-think that the above is the shortest protocol (in particular in the
-number of IPC operations) that is also secure and robust (and if it
-is not, we think it can be fixed to be secure and robust with minimal
-changes). We have no proof of its correctness. Our confidence comes
-from the scrutiny we applied to it. If you find a problem with the
-above protocol, or if you can prove various aspects of it, we would
-like to hear about it.
-
-To understand why the protocol is laid out as it is, and why it is a
-secure and robust protocol, one has to understand what could possibly
-go wrong and why it does not cause any problems for any participant if
-it follows its part of the protocol (independent on what the other
-participants do). In the following paragraphs, various scenarios are
-suggested where things do not go as expected in the above protocol.
-This is probably not a complete list, but it should come close to it.
-If you find any other problematic scenario, again, let us know.
-
-\begin{comment}
- Although some comments like this appear in the protocol description
- above, many comments have been spared for the following analysis of
- potential problems. Read the analysis carefully, as it provides
- important information about how, and more importantly, why it works.
-\end{comment}
-
-\paragraph{The server $S$ dies}
-What happens if the server $S$ dies unexpectedly sometime throughout
-the protocol?
-
-\begin{comment}
- At any time a task dies, the task info caps it held are released.
- Also, task death notifications are sent to any task that holds task
- info caps to the now dead task. The task death notifications will
- be processed asynchronously, so they might be processed immediately,
- or at any later time, even long after the task died! So one
- important thing to keep in mind is that the release of task info
- caps a task held, and other tasks noticing the task death, are
- always some time apart.
-\end{comment}
-
-Because the client $C$ holds a task info cap for $S$ no imposter can
-get the task ID of $S$. $C$ and $D$ will get errors when trying to
-send messages to $S$.
-
-\begin{comment}
- You might now wonder what happens if $C$ also dies, or if $C$ is
- malicious and does not hold the task info cap. You can use this as
- an exercise, and try to find the answer on your own. The answers
- are below.
-\end{comment}
-
-Eventually, $C$ (and $D$ if it already got the task info cap for $S$)
-will process the task death notification and clean up their state.
-
-\paragraph{The client $C$ dies}
-The server $S$ and the destination task $D$ hold a task info cap for
-$C$, so no imposter can get its task ID. $S$ and $D$ will get errors
-when trying to send messages to $C$. Depending on when $C$ dies, the
-capability might be copied successfully or not at all.
-
-Eventually, $S$ and $D$ will process the task death notification and
-release all resources associated with $C$. If the reference was not
-yet copied, this will include the reference container associated with
-$C$, if any. If the reference was already copied, this will only
-include the empty reference container, if any.
-
-\begin{comment}
- Of course, the participants need to use internal locking to protect
- the integrity of their internal data structures. The above protocol
- does not show where locks are required. In the few cases where some
- actions must be performed atomically, a wording is used that
- suggests that.
-\end{comment}
-
-\paragraph{The destination task $D$ dies}
-
-The client $C$ holds a task info cap for $D$ over the whole operation,
-so no imposter can get its task ID. Depending on when $D$ dies,
-either it has not yet accepted the capability, in which case $C$ will
-clean up by destroying the reference container, or it has, in which
-case $S$ will clean up its state when it processes the task death
-notification for $D$.
-
-\paragraph{The client $C$ and the destination task $D$ die}
-
-This scenario is the reason why the server acquires its own task info
-cap for $D$ so early, and why it must do that under the constraint
-that $C$ still lives. If $C$ and $D$ die before the server created
-the reference container, then either no request was made, or creating
-the task info cap for $D$ fails because of the constraint. If $C$ and
-$D$ die afterwards, then no imposter can get the task ID of $D$ and
-try to get at the reference in the container, because the server has
-its own task info cap for $D$.
-
-\begin{comment}
- This problem was identified very late in the development of this
- protocol. We just did not think of both clients dying at the same
- time! In an earlier version of the protocol, the server would
- acquire its task info cap when $D$ accepts its reference. This is
- too late: If $C$ and $D$ die just before that, an imposter with
- $D$'s task ID can try to get the reference in the container before
- the server processes the task death notification for $C$ and
- destroys it.
-\end{comment}
-
-Eventually, the server will receive and process the task death
-notifications. If it processes the task death notification for $C$
-first, it will destroy the whole container immediately, including the
-reference, if any. If it processes the task death notification for
-$D$ first, it will destroy the reference, and leave behind the empty
-container associated with $C$, until the other task death notification
-is processed. Either way no imposter can get at the capability.
-
-Of course, if the capability was already copied at the time $C$ and
-$D$ die, the server will just do the normal cleanup.
-
-\paragraph{The client $C$ and the server $S$ die}
-
-This scenario does not cause any problems, because the destination
-task $D$ holds a task info cap for $C$, and it acquires its own task
-info cap for $S$. Although it does this quite late in the protocol,
-it does so under the constraint that $C$, which holds a task info cap
-for $S$ the whole time (until it dies), still lives. It also gets the
-task info cap for $S$ before sending any message to it. An imposter
-that obtained the task ID of $S$ because $C$ died early would not
-receive any message from $D$, because $D$ uses $C$ as its constraint
-when acquiring the task info cap for $S$.
-
-\paragraph{The destination task $D$ and the server $S$ die}
-
-As $C$ holds task info caps for $S$ and $D$, there is nothing that can
-go wrong here. Eventually, the task death notifications are
-processed, but the task info caps are not released until the protocol
-is completed or aborted because of errors.
-
-\paragraph{The client $C$, the destination task $D$ and the server $S$ die}
-
-Before the last one of these dies, you are in one of the scenarios
-which already have been covered. After the last one dies, there is
-nothing to take care of anymore.
-
-\begin{comment}
- In this case your problem is probably not the capability copy
- protocol, but the stability of your software! Go fix some bugs.
-\end{comment}
-
-So far we have covered the scenarios where one or more of the
-participating tasks die unexpectedly. They could also die
-purposefully. Other things that tasks can purposefully try to do to
-break the protocol are presented in the following paragraphs.
-
-\begin{comment}
- A task that tries to harm other tasks by not following a protocol
- and not behaving as other tasks expect is malicious. Besides
- security concerns, this is also an issue of robustness, because
- malicious behaviour can also be triggered by bugs rather than bad
- intentions.
-
- It is difficult to protect against malicious behaviour by trusted
- components, like the server $S$, which is trusted by both $C$ and
- $D$. If a trusted component is compromised or buggy, ill
- consequences for software that trusts it must be expected. Thus, no
- analysis is provided for scenarios involving a malicious or buggy
- server $S$.
-\end{comment}
-
-\paragraph{The client $C$ is malicious}
-
-If the client $C$ wants to break the protocol, it has numerous
-possibilities to do so. The first thing it can do is to provide a
-wrong destination task ID when creating the container. But in this
-case, the server will return an error to $D$ when it tries to accept
-it, and this will give $D$ a chance to notice the problem and clean
-up. This also would allow for some other task to receive the
-container, but the client can give the capability to any other task it
-wants to anyway, so this is not a problem.
-
-\begin{comment}
- If a malicious behaviour results in an outcome that can also be
- achieved by following the normal protocol with different
- parameters, then this is not a problem at all.
-\end{comment}
-
-The client could also try to create a reference container for $D$ and
-then not tell $D$ about it. However, a reference container should not
-consume a lot of resources in the server, and all such resources
-should be attributed to $C$. When $C$ dies eventually, the server
-will clean up any such pending containers when the task death
-notification is processed.
-
-The same argument holds when $C$ leaves out the call to
-\verb/cap_ref_cont_destroy/.
-
-The client $C$ could also provide wrong information to $D$. It could
-supply a wrong server thread ID $T$. It could supply a wrong
-reference container ID $R$. If $D$ does not trust $C$ and expects a
-capability implemented by some specific trusted server, it will verify
-the thread ID numerically and reject it if it does not match. The
-reference container ID will be verified by the server, and it will
-only be accepted if the reference container was created by the client
-task $C$. Thus, the only wrong reference container IDs that the
-client $C$ could use without provoking an error message from the
-server (which would then lead $D$ to abort the operation) would be
-those of reference containers that it created itself in the first
-place. However, $C$ is already free to send $D$ any reference
-container it created.
-
-\begin{comment}
- Again, $C$ can not achieve anything that it could not achieve by
- just following the protocol. If $C$ tries to use the same
- reference container with several RPCs in $D$, one of them would
- succeed and the others would fail, hurting only $C$.
-
- If $D$ does trust $C$, then it can not protect against malicious
- behaviour by $C$.
-\end{comment}
-
-To summarize the result so far: $C$ can provide wrong data in the
-operations it does, but it can not achieve anything this way that it
-could not achieve by just following the protocol. In most cases the
-operation would just fail. If it leaves out some operations, trying
-to provoke resource leaks in the server, it will only hurt itself (as
-the reference container is strictly associated with $C$ until the
-reference is accepted by $D$).
-
-\begin{comment}
- For optimum performance, the server should be able to keep the
- information about the capabilities and reference containers a
- client holds in memory that is allocated on the client's behalf.
-
- It might also use some type of quota system.
-\end{comment}
-
-Another attack that $C$ can attempt is to deny a service that $S$ and
-$D$ expect of it. Besides omitting one or more of the RPCs, this
-means in particular not holding the task info caps for the time spans as
-described in the protocol. Of course, this can only be potentially
-dangerous in combination with a task death. If $C$ does not hold the
-server task info capability, then an imposter of $S$ could trick $D$
-into using the imposter as the server. However, this is only possible
-if $D$ already trusts $C$. Otherwise it would only allow servers that
-it already trusts, and it would always hold task info caps to such
-trusted servers when making the decision that it trusts them.
-However, if $D$ trusts $C$, it can not protect against $C$ being
-malicious.
-
-\begin{comment}
- If $D$ does not trust $C$, it should only ever compare the task ID
- of the server thread against trusted servers it has a task info cap
- for. It must not rely on $C$ doing that for $D$.
-
- However, if $D$ does trust $C$, it can rely on $C$ holding the
- server task info cap until it got its own. Thus, the task ID of $C$
- can be used as the constraint when acquiring the task info cap in
- the protocol.
-\end{comment}
-
-If $C$ does not hold the task info cap of $D$, and $D$ dies before the
-server acquires its task info cap for $D$, it might get a task info
-cap for an imposter of $D$. But if the client wants to achieve that,
-it could just follow the protocol with the imposter as the destination
-task.
-
-\paragraph{The destination task $D$ is malicious}
-
-The destination task does not have as many possibilities as $C$ to
-attack the protocol, because it is trusted by $C$. So the only
-participant that $D$ can try to attack is the server $S$. But the
-server $S$ does not rely on any action by $D$. $D$ does not hold any
-task info caps for $S$. The only operation it does is an RPC to $S$
-accepting the capability, and if it omits that it will just not get
-the capability (the reference will be cleaned up by $C$ or by the
-server when $C$ dies).
-
-The only thing that $D$ could try is to provide false information in
-the \verb/cap_ref_cont_accept/ RPC. The information in that RPC is
-the task ID of the client $C$ and the reference container ID $R$. The
-server will verify that the client $C$ has previously created a
-reference container with the ID $R$ that is destined for $D$. So $D$
-will only be able to accept references that it is granted access to.
-Thus it can not achieve anything that it could not achieve by following
-the protocol (possibly the protocol with another client). If $D$
-accepts capabilities from other transactions outside of the protocol,
-it can only cause other transactions in its own task to fail.
-
-\begin{comment}
- If you can do something wrong and harm yourself that way, then this
- is called ``shooting yourself in the foot''.
-
- The destination task $D$ is welcome to shoot itself in the foot.
-\end{comment}
-
-\paragraph{The client $C$ and the destination task $D$ are malicious}
-
-The final question we want to raise is what can happen if the client
-$C$ and the destination task $D$ are malicious. Can $C$ and $D$
-cooperate in attacking $S$ in a way that $C$ or $D$ alone could not?
-
-In the above analysis, there is no place where we assume any specific
-behaviour of $D$ to help $S$ in preventing an attack on $S$. There is
-only one place where we make an assumption for $C$ in the analysis of
-a malicious $D$. If $D$ does not accept a reference container, we
-said that $C$ would clean it up by calling
-\verb/cap_ref_cont_destroy/. So we have to look at what would happen
-if $C$ were not to do that.
-
-Luckily, we covered this case already. It is identical to the case
-where $C$ does not even tell $D$ about the reference container and
-just does nothing. In this case, as said before, the server will
-eventually release the reference container when $C$ dies. Before
-that, it only occupies resources in the server that are associated
-with $C$.
-
-This analysis is sketchy in parts, but it covers a broad range of
-possible attacks. For example, all possible and relevant combinations
-of task deaths and malicious tasks are covered. Although by no means
-complete, it can give us some confidence about the correctness of the
-protocol. It also provides a good set of test cases against which you
-can test your own protocols and improvements to the above protocol.
-
-
-\subsubsection{The trust rule}
-
-The protocol to copy a capability from one client to another task has
-a dramatic consequence on the design of the Hurd interfaces.
-
-Because the receiver of the capability must make blocking calls to the
-server providing the capability, the receiver of the capability
-\emph{must} trust the server providing the capability.
-
-This means also: If the receiver of a capability does not trust the
-server providing the capability, it \emph{must not} accept it.
-
-The consequence is that normally, servers can not accept capabilities
-from clients, unless they are provided by a specific trusted server.
-This can be the \texttt{task} or \texttt{auth} server for example.
-
-This rule is even true if the receiver does not actually want to use
-the capability for anything. Merely accepting the capability already
-requires trusting the server providing it.
-
-In the Hurd on Mach, ports (which are analogous to capabilities in
-this context) can be passed around freely. There is no security risk
-in accepting a port from any source, because the kernel implements
-them as protected objects. Using a port by sending blocking messages
-to it requires trust, but simply storing the port on the server side
-does not.
-
-This is different in the Hurd on L4: A server must not accept
-capabilities unless it trusts the server providing them. Because
-capabilities are used for many different purposes (remote objects,
-authentication, identification), one has to be very careful in
-designing the interfaces. The Hurd interfaces on Mach use ports in a
-way that is not possible on L4. Such interfaces need to be
-redesigned.
-
-Often, redesigning such an interface also fixes other security
-problems that exist within the Hurd, in particular DoS attacks. A
-good part of this paper is about redesigning the Hurd to
-avoid storing untrusted capabilities on the server side.
-
-\begin{comment}
- Examples are:
-
- \begin{itemize}
- \item The new authentication protocol, which eliminates the need for
- a rendezvous port and is not only faster, but also does not
- require the server to block on the client anymore (see section
- \ref{auth} on page \pageref{auth}).
-
- \item The signal handling, which does not require the \texttt{proc}
- server to hold the signal port for every task anymore (see section
- \ref{signals} on page \pageref{signals}).
-
- \item The new exec protocol, which eliminates the need to pass all
- capabilities that need to be transferred to the new executable from
- the old program to the filesystem server, and then to the
- \texttt{exec} server (see section \ref{exec} on page
- \pageref{exec}).
-
- \item The new way to implement Unix Domain Sockets, which does not
- require a trusted system server, so that descriptor passing (which
- is really capability passing) can work (see section
- \ref{unixdomainsockets} on page \pageref{unixdomainsockets}).
-
- \item The way parent and child filesystem are linked to each other,
- in other words: how mounting a filesystem works (see section
- \ref{xfslookup} on page \pageref{xfslookup}).
-
- \item The replacement for the \verb/file_reparent()/ RPC (see
- section \ref{reparenting} on page \pageref{reparenting}).
- \end{itemize}
-\end{comment}
-
-\subsection{Synchronous IPC}
-
-The Hurd only needs synchronous IPC. Asynchronous IPC is usually not
-required. One exception is notifications (see below).
-
-There are possibly some places in the Hurd source code where
-asynchronous IPC is assumed. These must be replaced with different
-strategies. One example is the implementation of select() in the GNU
-C library.
-
-\begin{comment}
- A naive implementation would use one thread per capability to select
- on. A better one would combine all capabilities implemented by the
- same server in one array and use one thread per server.
-
- A more complex scheme might let the server process select() calls
- asynchronously and report the result back via notifications.
-\end{comment}
-
-In other cases, the Hurd sends a message and receives the reply
-asynchronously. This works fine in Mach, because send-once rights are
-used as reply ports and Mach guarantees to deliver the reply message,
-ignoring the kernel queue limit. In L4, no messages are queued and
-such places need to be rewritten in a different way (for example using
-extra threads).
-
-\begin{comment}
- What happens if a client does not go into the receive phase after a
- send, but instead does another send, and another one, quickly many
- sends, as fast as possible? A carelessly written server might
- create worker threads for each request. Instead, the server should
- probably refuse to accept a request from a client thread that
- already has a pending request, so that the number of worker threads
- is limited to the number of client threads.
-
- This also makes interrupting an RPC operation easier (the client
- thread ID can be used to identify the request to interrupt).
-\end{comment}
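-
-The suggested policy can be sketched in a few lines of C. The helper
-names (\verb/hash_find/, \verb/hash_add/) are assumptions standing in
-for whatever lookup structure the server uses.
-
-\begin{verbatim}
-/* Sketch: accept a request from the client thread SENDER only if
-   that thread has no RPC pending, bounding the number of worker
-   threads by the number of client threads.  */
-int
-accept_request (l4_thread_id_t sender)
-{
-  if (hash_find (pending_rpcs, sender))
-    return 0;                /* Already has a pending RPC.  */
-
-  hash_add (pending_rpcs, sender);
-  return 1;
-}
-\end{verbatim}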
-
-
-\subsection{Notifications}
-
-Notifications to untrusted tasks happen frequently. One case is
-object death notifications, in particular task death notifications.
-Other cases might be select() or notifications of changes to the
-filesystem.
-
-The console uses notifications to broadcast change events to the
-console content, but it also uses shared memory to broadcast the
-actual data, so not all notifications need to be received for
-functional operation. Still, at least one notification is queued by
-Mach, and this is sufficient for the console to wake up whenever
-changes happen, even if the changes can not be processed
-immediately.
-
-From the server's point of view, notifications are simply messages
-with a send and xfer timeout of 0 and without a receive phase.
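-
-A sketch of such a notification send, using an assumed wrapper
-\verb/ipc_send/ around the L4 IPC primitive (the wrapper name and
-signature are not a defined interface):
-
-\begin{verbatim}
-/* Sketch: send a notification message MSG to CLIENT.  Both the
-   send and the xfer timeout are zero and there is no receive
-   phase, so the server can never block on a slow client.  */
-void
-notify (l4_thread_id_t client, struct msg *msg)
-{
-  /* A failed send is deliberately ignored; see the discussion
-     below on how clients can cope with missed notifications.  */
-  ipc_send (client, msg, TIMEOUT_ZERO /* send */,
-            TIMEOUT_ZERO /* xfer */);
-}
-\end{verbatim}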
-
-For the client, however, there is only one way to ensure that it will
-receive the notification: It must have the receiving thread in the
-receive phase of an IPC. While this thread is processing the
-notification (even if it is only delegating it), it might be preempted
-and another (or the same) server might try to send a second
-notification.
-
-\begin{comment}
- It is an open challenge how the client can ensure that it either
- receives the notification or at least knows that it missed it, while
- the server remains safe from potential DoS attacks. The usual
- strategy, to give receivers of notifications a higher scheduling
- priority than the sender, is not usable in a system with untrusted
- receivers (like the Hurd). The best strategy determined so far is
- to have the servers retry sending the notification several times
- with small delays in between. This can increase the chance that a
- client is able to receive the notification. However, there is still
- the question of what a server can do if the client is not ready.
-
- An alternative might be a global trusted notification server that
- runs at a higher scheduling priority and records which servers have
- notifications for which clients, and that can be used by clients to
- be notified of pending notifications. Then the clients can poll the
- notifications from the servers.
-\end{comment}
-
-
-\section{Threads and Tasks}
-
-The \texttt{task} server will provide the ability to create tasks and
-threads, and to destroy them.
-
-\begin{comment}
- In L4, only threads in the privileged address space (the rootserver)
- are allowed to manipulate threads and address spaces (using the
- \textsc{ThreadControl} and \textsc{SpaceControl} system calls). The
- \texttt{task} server will use the system call wrappers provided by
- the rootserver, see section \ref{rootserver} on page
- \pageref{rootserver}.
-\end{comment}
-
-The \texttt{task} server provides three different capability types.
-
-\paragraph{Task control capabilities}
-If a new task is created, it is always associated with a task control
-capability. The task control capability can be used to create and
-destroy threads in the task, and destroy the task itself. So the task
-control capability gives the owner of a task control over it. Task
-control capabilities have the side effect that the task ID of this
-task is not reused, as long as the task control capability is not
-released. Thus, having a task control capability affects the global
-namespace of task IDs. If a task is destroyed, task death
-notifications are sent to holders of task control capabilities for
-that task.
-
-\begin{comment}
- A task is also implicitly destroyed when the last task control
- capability reference is released.
-\end{comment}
-
-\paragraph{Task info capabilities}
-\label{taskinfocap}
-Any task can create task info capabilities for other tasks. Such task
-info capabilities are used mainly in the IPC system (see section
-\ref{ipc} on page \pageref{ipc}). Task info capabilities have the
-side effect that the task ID of this task is not reused, as long as
-the task info capability is not released. Thus, having a task info
-capability affects the global namespace of task IDs. If a task is
-destroyed, task death notifications are sent to holders of task info
-capabilities for that task.
-
-\begin{comment}
- Because of that, holding task info capabilities must be restricted
- somehow. Several strategies can be taken:
-
- \begin{itemize}
- \item Task death notifications can be monitored. If there is no
- acknowledgement within a certain time period, the \texttt{task}
- server could be allowed to reuse the task ID anyway. This is not
- a good strategy because it can considerably weaken the security of
- the system (capabilities might be leaked to tasks which reuse such
- a task ID reclaimed by force).
- \item The \texttt{proc} server can show dead task IDs that are not
- released yet, in analogy to zombie processes in Unix. It can also make
- available the list of tasks which prevent reusing the task ID, to
- allow users or the system administrator to clean up manually.
- \item Quotas can be used to punish users who do not acknowledge
- task deaths in a timely fashion. For example, if the number of
- tasks the user
- is allowed to create is restricted, the task info caps that the
- user holds for dead tasks could be counted toward that limit.
- \item Any task could be restricted to as many task ID references as
- there are live tasks in the system, plus some slack. That would
- prevent the task from creating new task info caps if it does not
- release old ones from dead tasks. The slack would be provided so
- as not to unnecessarily slow down a task that processes task death
- notifications asynchronously to making connections with new tasks.
- \end{itemize}
-
- In particular, the last two approaches should prove to be effective
- in providing an incentive for tasks to release task info caps they
- do not need anymore.
-\end{comment}
-
-\paragraph{Task manager capability}
-A task is a relatively simple object, compared to a full blown POSIX
-process, for example. As the \texttt{task} server is enforced system
-code, the Hurd does not impose POSIX process semantics in the task
-server. Instead, POSIX process semantics are implemented in a
-different server, the \texttt{proc} server (see also section \ref{proc} on page
-\pageref{proc}). To allow the \texttt{proc} server to do its work, it
-needs to be able to get the task control capability for any task, and
-gather other statistics about them. Furthermore, there must be the
-possibility to install quota mechanisms and other monitoring systems.
-The \texttt{task} server provides a task manager capability, that
-allows the holder of that capability to control the behaviour of the
-\texttt{task} server and get access to the information and objects it
-provides.
-
-\begin{comment}
- For example, the task manager capability could be used to install a
- policy capability that is used by the \texttt{task} server to make
- upcalls to a policy server whenever a new task or thread is created.
- The policy server could then indicate if the creation of the task or
- thread is allowed by that user. For this to work, the \texttt{task}
- server itself does not need to know about the concept of a user, or
- the policies that the policy server implements.
-
- Now that I am writing this, I realize that without any further
- support by the \texttt{task} server, the policy server would be
- restricted to the task and thread ID of the caller (or rather the
- task control capability used) to make its decision. A more
- capability oriented approach would then not be possible. This
- requires more thought.
-
- The whole task manager interface is not written yet.
-\end{comment}
-
-When creating a new task, the \texttt{task} server allocates a new
-task ID for it. The task ID will be used as the version field of the
-thread ID of all threads created in the task. This allows the
-recipient of a message to verify the sender's task ID efficiently and
-easily.
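-
-A sketch of the check in C, assuming the \verb/L4_Version/ accessor
-of the L4 convenience interface (or the equivalent \verb/libl4/
-wrapper) for extracting the version field of a global thread ID:
-
-\begin{verbatim}
-/* Sketch: recover the sender's task ID from its global thread
-   ID.  Under the scheme described above, the task ID is simply
-   the version field of the thread ID.  */
-unsigned int
-task_id_of_sender (L4_ThreadId_t sender)
-{
-  return L4_Version (sender);
-}
-\end{verbatim}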
-
-\begin{comment}
- The version field is 14 bits wide on 32-bit architectures, and 32
- bits wide on 64-bit architectures. Because the lower six bits must
- not be all
- zero (to make global thread IDs different from local thread IDs),
- the number of available task IDs is $2^{14} - 2^6$ resp. $2^{32} -
- 2^6$.
-
- If several systems are running in parallel on the same host, they
- might share thread IDs by encoding the system ID in the upper bits
- of the thread number.
-\end{comment}
-
-Task IDs will be reused only if there are no task control or info
-capabilities for that task ID held by any task in the system. To
-support bootstrapping an IPC connection (see section
-\ref{ipcbootstrap} on page \pageref{ipcbootstrap}), the \texttt{task}
-server will delay reusing a task ID as long as possible.
-
-\begin{comment}
- This is similar to how PIDs are generated in Unix. Although an
- attempt is made to keep PIDs small for ease of use, PIDs are not
- reused immediately. Instead, the PID is incremented up to a certain
- maximum number, and only then smaller PID values are reused again.
-
- As task IDs are not a user interface, there is no need to keep them
- small. The whole available range can be used to delay reusing a
- task ID as long as possible.
-\end{comment}
-
-When creating a new task, the \texttt{task} server also has to create
-the initial thread. This thread will be inactive. Once the user has
-requested the creation and activation of the initial thread, it will
-be activated. When the user requests the destruction of the last
-thread in a task, the \texttt{task} server makes that thread inactive
-again.
-
-\begin{comment}
- In L4, an address space can only be implicitly created (resp.
- destroyed) with the first (resp. last) thread in that address space.
-\end{comment}
-
-Some operations, like starting and stopping threads in a task, can not
-be supported by the \texttt{task} server, but have to be implemented locally in
-each task because of the minimality of L4. If external control over
-the threads in a task at this level is required, the debugger
-interface might be used (see section \ref{debug} on page
-\pageref{debug}).
-
-
-\subsection{Accounting}
-
-We want to allow the users of the system to use the \texttt{task}
-server directly, and ignore other task management facilities like the
-\texttt{proc} server. However, the system administrator still needs
-to be able to identify the user who created such anonymous tasks.
-
-For this, a simple accounting mechanism is provided by the
-\texttt{task} server. An identifier can be set for a task using the
-task manager capability; it is inherited at task creation time from
-the parent task. This accounting ID can not be changed without the
-task manager capability.
-
-The \texttt{proc} server sets the accounting ID to the process ID
-(PID) of the task whenever a task registers itself with the
-\texttt{proc} server. This means that all tasks that do not register
-themselves with the \texttt{proc} server will be grouped together with
-the first parent task that did. This makes it easy to kill all
-unregistered tasks together with their registered parent.
-
-The \texttt{task} server does not interpret or use the accounting ID
-in any way.
-
-
-\subsection{Proxy Task Server}
-\label{proxytaskserver}
-
-The \texttt{task} server can be safely proxied, and the users of such
-a proxy task server can use it like the real \texttt{task} server,
-even though capabilities work a bit differently for the \texttt{task}
-server than for other servers.
-
-The problem exists because the proxy task server would hold the real
-task info capabilities for the task info capabilities that it provides
-to the proxied task. So if the proxy task server dies, all such task
-info capabilities would be released, and the tasks using the proxy
-task server would become insecure and open to attacks by imposters.
-
-However, this is not really a problem, because the proxy task server
-will also provide proxy objects for all task control capabilities. So
-it will be the only task which holds task control capabilities for the
-tasks that use it. When the proxy task server dies, all tasks that
-were created with it will be destroyed when these task control
-capabilities are released. The proxy task server is a vital system
-component for the tasks that use it, just as the real \texttt{task}
-server is a vital system component for the whole system.
-
-
-\subsection{Scheduling}
-
-The \texttt{task} server is the natural place to implement a simple,
-initial scheduler for the Hurd. A first version can at least collect
-some information about the CPU time of a task and its threads. Later,
-a proper scheduler has to be written that also has SMP support.
-
-The scheduler should run at a higher priority than normal threads.
-
-\begin{comment}
- This might require that the whole task server must run at a higher
- priority, which makes sense anyway.
-
- Not much thought has been given to the scheduler so far. This is
- work that still needs to be done.
-\end{comment}
-
-There is no way to get at the ``system time'' in L4; it is assumed
-that no time is spent in the kernel (which is mostly true). So system
-time will always be reported as $0.00$ or $0.01$.
-
-
-\section{Virtual Memory Management}
-
-Traditionally, monolithic kernels, and even kernels like Mach,
-provide a virtual memory management system in the kernel. All paging
-decisions are made by the kernel itself. This requires good
-heuristics. Smart paging decisions are often not possible because the
-kernel lacks the information about how the data is used.
-
-In the Hurd, paging will be done locally in each task. A physical
-memory server provides a number of guaranteed physical pages to tasks.
-It will also provide a number of excess pages (over-commit). The task
-might have to return any number of excess pages on short notice. If
-the task does not comply, all mappings are revoked (essentially
-killing the task).
-
-A problem arises when data has to be exchanged between a client and a
-server, and the server wants to have control over the content of the
-pages (for example, pass it on to other servers, like device drivers).
-The client can not map the pages directly into the server's address
-space, as the client is not trusted. Container objects created in the
-physical memory server and mapped into the client's and/or the server's
-address space will provide the necessary security features to allow
-this. This can be used for DMA and zero-copying in the data exchange
-between device drivers and (untrusted) user tasks.
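-
-A rough sketch of how a client might use such a container to hand
-data to a server follows. All names are hypothetical; the actual
-container interface is not settled here.
-
-\begin{verbatim}
-/* Sketch only; hypothetical interface.  */
-container_t c;
-
-/* Create a container backed by NPAGES of our guaranteed pages.  */
-err = physmem_container_create (physmem_cap, npages, &c);
-
-/* Map it locally and fill in the data to be written.  */
-err = physmem_container_map (c, &local_addr);
-memcpy (local_addr, buffer, size);
-
-/* Pass the container, not raw page mappings, to the server.  */
-err = device_write (device_cap, c, size);
-\end{verbatim}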
-
-
-\section{Authentication}
-\label{auth}
-
-Capabilities are a good way to give access to protected objects and
-services. They are flexible, lightweight and generic. However, Unix
-traditionally uses access control lists (ACL) to restrict access to
-objects like files. Any task running with a certain user ID can
-access all files that are readable for the user with that user ID.
-Although all objects are implemented as capabilities in the Hurd, the
-Hurd also supports the use of user IDs for access control.
-
-The system authentication server \texttt{auth} implements the Unix
-authentication scheme using capabilities. It provides auth
-capabilities, which are associated with a list of effective and
-available user and group IDs. The holder of such a capability can use
-it to authenticate itself to other servers, using the protocol below.
-
-Of course, these other servers must use (and trust) the same
-\texttt{auth} server as the user. Otherwise, the authentication will
-fail. Once a capability is authenticated in the server, the server
-will know the user IDs of the client, and can use them to validate
-further operations.
-
-The \texttt{auth} server provides two types of capabilities:
-
-\paragraph{Auth capabilities}
-An auth capability is associated with four vectors of IDs: The
-effective user and group IDs, which should be used by other servers to
-authenticate operations that require certain user or group IDs, and
-the available user and group IDs. Available IDs should not be used
-for authentication purposes, but can be turned into effective IDs by
-the holder of an auth capability at any time.
-
-New auth capabilities can be created from existing auth capabilities,
-but only if the requested IDs are a subset of the union of the
-(effective and available) IDs in the provided auth capabilities. If
-an auth capability has an effective or available user ID 0, then
-arbitrary new auth objects can be created from that.
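-
-The subset rule above amounts to a simple check, sketched here in C
-(hypothetical types; user ID 0 short-circuits the test):
-
-\begin{verbatim}
-/* Sketch: is each requested ID contained in the union of the
-   effective (EFF) and available (AVAIL) IDs?  */
-static int
-contains (const uid_t *ids, size_t n, uid_t id)
-{
-  for (size_t i = 0; i < n; i++)
-    if (ids[i] == id)
-      return 1;
-  return 0;
-}
-
-static int
-subset_allowed (const uid_t *req, size_t nreq,
-                const uid_t *eff, size_t neff,
-                const uid_t *avail, size_t navail)
-{
-  if (contains (eff, neff, 0) || contains (avail, navail, 0))
-    return 1;  /* User ID 0 may request arbitrary IDs.  */
-  for (size_t i = 0; i < nreq; i++)
-    if (!contains (eff, neff, req[i])
-        && !contains (avail, navail, req[i]))
-      return 0;
-  return 1;
-}
-\end{verbatim}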
-
-\paragraph{Passport capabilities}
-A passport capability can be created from an auth capability and is
-only valid for the task that created it. It can be provided to a
-server in an authentication process (see below). For the client, the
-passport capability does not directly implement any useful operation.
-For the server, it can be used to verify the identity of a user and
-read out the effective user and group IDs.
-
-The auth server should always create new passport objects for
-different tasks, even if the underlying auth object is the same, so
-that a task having the passport capability can not spy on other tasks
-unless they were given the passport capability by that task.
-
-\subsection{Authenticating a client to a server}
-
-A client can authenticate itself to a server with the following
-protocol:
-
-\paragraph{Preconditions}
-The client $C$ has an auth capability implemented by the \texttt{auth}
-server $A$. It also has a capability implemented by the server $S$.
-It wants to reauthenticate this capability with the auth capability,
-so the server associates the new user and group IDs with it.
-
-The server also has an auth capability implemented by its trusted
-\texttt{auth} server. For the reauthentication to succeed, the
-\texttt{auth} server of the client and the server must be identical.
-If this is the case, the participating tasks hold task info caps for
-all other participating tasks (because of the capabilities they hold).
-
-\begin{enumerate}
-\item The client $C$ requests a passport capability for itself by
-  invoking the auth capability implemented by $A$.
-
- \begin{comment}
- Normally, the client will request the passport capability only
- once and store it together with the auth capability.
- \end{comment}
-
-\item The \texttt{auth} server receives the request and creates a new
- passport capability for this auth capability and this client. The
- passport capability is returned to the user.
-
-\item The user receives the reply from the \texttt{auth} server.
-
- It then sends the reauthentication request to the server $S$, which
- is invoked on the capability the client wants to reauthenticate. It
- provides the passport capability as an argument.
-
-\item The server $S$ can accept the passport capability, if it
- verifies that it is really implemented by the \texttt{auth} server
- it trusts. If the client does not provide a passport capability to
- the trusted \texttt{auth} server, the authentication process is
- aborted with an error.
-
- Now the server can send a request to the \texttt{auth} server to
- validate the passport capability. The RPC is invoked on the
- passport capability.
-
-\item The \texttt{auth} server receives the validation request on the
- passport capability and returns the task ID of the client $C$ that
- this passport belongs to, and the effective user and group IDs for
- the auth cap to which this passport cap belongs.
-
- \begin{comment}
- The Hurd on Mach returned the available IDs as well. This feature
- is not used anywhere in the Hurd, and as the available IDs should
- not be used for authentication anyway, this does not seem to be
- useful. If it is needed, it can be added in an extended version
- of the validation RPC.
- \end{comment}
-
-\item The server receives the task ID and the effective user and group
- IDs. The server now verifies that the task ID is the same as the
-  task ID of the sender of the reauthentication request. Only then is
-  it certain that the request was made by the owner of the auth cap.
- It can then return a new capability authenticated with the new user
- and group IDs.
-
- \begin{comment}
- The verification of the client's task ID is necessary. As the
- passport cap is copied to other tasks, it can not serve as a proof
- of identity alone. It is of course absolutely crucial that the
- server holds the task info cap for the client task $C$ for the
- whole time of the protocol. But the same is actually true for any
- RPC, as the server needs to be sure that the reply message is sent
- to the sender thread (and not any imposter).
- \end{comment}
-
-\item The client receives the reply with the new, reauthenticated
- capability. Usually this capability is associated in the server
- with the same abstract object, but different user credentials.
-
- \begin{comment}
- Of course a new capability must be created. Otherwise, all other
- users holding the same capability would be affected as well.
- \end{comment}
-
- The client can now deallocate the passport cap.
-
- \begin{comment}
- As said before, normally the passport cap is cached by the client
- for other reauthentications.
- \end{comment}
-\end{enumerate}
-
-\paragraph{Result}
-The client $C$ has a new capability that is authenticated with the new
-effective user and group IDs. The server has obtained the effective
-user and group IDs from the \texttt{auth} server it trusts.
-
-\begin{comment}
- The Hurd on Mach uses a different protocol, which is more complex
- and is vulnerable to DoS attacks. The above protocol can not
- readily be used on Mach, because the sender task of a message can
- not be easily identified.
-\end{comment}
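-
-From the client's point of view, the whole protocol might look like
-this. This is a sketch only; all function names are invented
-stand-ins for the RPC stubs described above.
-
-\begin{verbatim}
-/* Reauthenticate OBJ, a capability implemented by server S, using
-   AUTH_CAP.  Sketch only; hypothetical stubs.  */
-error_t
-reauthenticate (cap_t auth_cap, cap_t obj, cap_t *new_obj)
-{
-  cap_t passport;
-  error_t err;
-
-  /* Steps 1 and 2: get a passport for ourselves.  */
-  err = auth_create_passport (auth_cap, &passport);
-  if (err)
-    return err;
-
-  /* Steps 3 to 6: the server validates the passport with its own
-     auth server and returns a reauthenticated capability.  */
-  err = cap_reauthenticate (obj, passport, new_obj);
-
-  /* Step 7: done; a client would normally cache the passport.  */
-  cap_deallocate (passport);
-  return err;
-}
-\end{verbatim}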
-
-
-\section{The POSIX personality}
-
-The Hurd offers a POSIX API to the user by default. This is
-implemented in the GNU C library which uses the services provided by
-the Hurd servers. Several system servers support the C library.
-
-
-\subsection{Process Management}
-\label{proc}
-
-The \texttt{proc} server implements Unix process semantics in the Hurd
-system. It will also assign a PID to each task that was created with
-the \texttt{task} server, so that the owner of these tasks, and the
-system administrator, can at least send the \verb/SIGKILL/ signal to
-them.
-
-The \texttt{proc} server uses the task manager capability from the
-\texttt{task} server to get hold of the information about all tasks
-and the task control caps.
-
-\begin{comment}
- The \texttt{proc} server might also be the natural place to
- implement a first policy server for the \texttt{task} server.
-\end{comment}
-
-
-\subsubsection{Signals}
-\label{signals}
-
-Each process can register the thread ID of a signal thread with the
-\texttt{proc} server. The proc server will give the signal thread ID
-to any other task which asks for it.
-
-\begin{comment}
- The thread ID can be guessed, so there is no point in protecting it.
-\end{comment}
-
-The signal thread ID can then be used by a task to contact the task to
-which it wants to send a signal. The task must bootstrap its
-connection with the intended receiver of the signal, according to the
-protocol described in section \ref{ipcbootstrap} on page
-\pageref{ipcbootstrap}. As a result, it will receive the signal
-capability of the receiving task.
-
-When a signal is posted to the signal capability, the sender must
-provide some capability that proves that it is allowed to send the
-signal. For example, the owner of the task control
-cap is usually allowed to send any signal to it. Other capabilities
-might only give permission to send some types of signals.
-
-\begin{comment}
- The receiver of the signal decides itself which signals to accept
- from which other tasks. The default implementation in the C library
- provides POSIX semantics, plus some extensions.
-\end{comment}
-
-Signal handling is thus completely implemented locally in each task.
-The \texttt{proc} server only serves as a name-server for the thread
-IDs of the signal threads.
-
-\begin{comment}
- The \texttt{proc} server can not hold the signal capability itself,
- as it used to do in the implementation on Mach, as it does not trust
- the tasks implementing the capability. But this is not a problem,
- as the sender and receiver of a signal can negotiate and bootstrap
- the connection without any further support by the \texttt{proc}
- server.
-
- Also, the \texttt{proc} server can not even hold task info caps to
- support the sender of a signal in bootstrapping the connection.
- This means that there is a race between looking up the signal thread
- ID from the PID in the \texttt{proc} server and acquiring a task
- info cap for the task ID of the signal receiver in the sender.
- However, in Unix, there is always a race when sending a signal using
- \verb/kill/. The task server helps the users a bit here by not
- reusing task IDs as long as possible.
-\end{comment}
-
-Some signals are not implemented by sending a message to the task.
-\verb/SIGKILL/ for example destroys the task without contacting it at
-all. This feature is implemented in the \texttt{proc} server.
-
-The signal capability is also used for other things, like the message
-interface (which allows you to manipulate the environment variables
-and \texttt{auth} capability of a running task, etc).
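-
-Put together, sending a signal might look roughly like this from the
-sender's side (a sketch; all stub names are invented):
-
-\begin{verbatim}
-/* Sketch only; hypothetical stubs.  */
-l4_thread_id_t sig_thread;
-cap_t sig_cap;
-
-/* The proc server is only a name server for signal threads.  */
-err = proc_get_signal_thread (proc_cap, pid, &sig_thread);
-
-/* Bootstrap a connection as described in the IPC chapter,
-   yielding the receiver's signal capability.  */
-err = hurd_cap_bootstrap (sig_thread, &sig_cap);
-
-/* Post the signal, proving the right to send it, here with the
-   task control capability.  */
-err = signal_post (sig_cap, task_control_cap, SIGTERM);
-\end{verbatim}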
-
-
-\subsubsection{The \texttt{fork()} function}
-
-To be written.
-
-
-\subsubsection{The \texttt{exec()} function}
-\label{exec}
-
-The \texttt{exec()} operation will be done locally in a task.
-Traditionally, \texttt{exec()} overlays the same task with a new
-process image, because creating a new task and transferring the
-associated state is expensive. In L4, only the threads and virtual
-memory mappings are actually kernel state associated with a task, and
-exactly those have to be destroyed by \texttt{exec()} anyway. There
-is a lot of Hurd specific state associated with a task (capabilities,
-for example), but it is difficult to preserve that. There are
-security concerns, because POSIX programs do not know about Hurd
-features like capabilities, so inheriting all capabilities across
-\texttt{exec()} unconditionally seems dangerous.
-
-\begin{comment}
- One could think that if a program is not Hurd-aware, then it will
- not make any use of capabilities except through the normal POSIX
- API, and thus there are no capabilities except those that the GNU C
- library uses itself, which \texttt{exec()} can take care of.
- However, this is only true if code that is not Hurd-aware is never
- mixed with Hurd specific code, even libraries (unless the library
- intimately cooperates with the GNU C library). This would be a high
- barrier to enable Hurd features in otherwise portable programs and
- libraries.
-
- It is better to make all POSIX functions safe by default and allow
- for extensions to let the user specify which capabilities besides
- those used for file descriptors etc to be inherited by the new
- executable.
-
-  For \verb/posix_spawn()/, this is straightforward. For
-  \texttt{exec()}, it is not. Either specific capabilities could be
-  marked as ``do not close on \texttt{exec()}'', or variants of the
- \texttt{exec()} function could be provided which take further
- arguments.
-\end{comment}
-
-There are also implementation obstacles hindering the reuse of the
-existing task. Only local threads can manipulate the virtual memory
-mappings, and there is a lot of local state that has to be kept
-somewhere between the time the old program becomes defunct and the new
-binary image is installed and used (not to speak of the actual program
-snippet that runs during the transition).
-
-So the decision was made to always create a new task with
-\texttt{exec()}, and copy the desired state from the current task to
-the new task. This is a clean solution, because a new task will
-always start out without any capabilities in servers, etc, and thus
-there is no need for the old task to try to destroy all unneeded
-capabilities and other local state before \texttt{exec()}. Also, if
-the exec fails, the old program can continue to run, even if it fails
-at a very late point (there is no ``point of no return'' until the
-new task is actually up and running).
-
-For suid and sgid applications, the actual \texttt{exec()} has to be
-done by the filesystem. However, the filesystem can not be bothered
-to also transfer all the user state into the new task. It can not
-even do that, because it can not accept capabilities implemented by
-untrusted servers from the user. Also, the filesystem does not want
-to rely on the new task to be cooperative, because it does not
-necessarily trust the code, if it is owned by an untrusted user.
-
-\begin{enumerate}
-\item The user creates a new task and a container with a single
- physical page, and makes the \texttt{exec()} call to the file
- capability, providing the task control capability. Before that, it
- creates a task info capability from it for its own use.
-\item The filesystem checks permission and then revokes all other
- users on the task control capability. This will revoke the users
- access to the task, and will fail if the user did not provide a
- pristine task object. (It is assumed that the filesystem should not
- create the task itself so the user can not use suid/sgid
- applications to escape from their quota restriction).
-\item Then it revokes access to the provided physical page and writes
- a trusted startup code to it.
-\item The filesystem will also prepare all capability transactions and
- write the required information (together with other useful
- information) in a stack on the physical page.
-\item Then it creates a thread in the task, and starts it. At the
-  first page fault, it will provide the physical page.
-\item The startup code on the physical page completes the capability
- transfer. It will also install a small pager that can install file
- mappings for this binary image. Then it jumps to the entry point.
-\item The filesystem in the meanwhile has done all it can do to help
- the task startup. It will provide the content of the binary or
- script via paging or file reads, but that happens asynchronously,
- and as for any other task. So the filesystem returns to the client.
-\item The client can then send its untrusted information to the new
- task. The new task got the client's thread ID from the filesystem
- (possibly provided by the client), and thus knows to which thread it
- should listen. The new task will not trust this information
-  ultimately (i.e., the new task will use the authentication, root
- directory and other capabilities it got from the filesystem), but it
- will accept all capabilities and make proper use of them.
-\item Then the new task will send a message to proc to take over the
- old PID and other process state. How this can be done best is still
- to be determined (likely the old task will provide a process control
- capability to the new task). At that moment, the old task is
-  destroyed by the proc server.
-\end{enumerate}
-
-This is a coarse and incomplete description, but it shows the general
-idea. The details will depend a lot on the actual implementation.
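-
-The first step of the protocol, as seen by the user, could look
-roughly as follows. The stub names are invented for illustration
-and match the container sketch given earlier.
-
-\begin{verbatim}
-/* Sketch only; hypothetical stubs.  */
-cap_t task_control, task_info, container;
-
-err = task_create (task_cap, &task_control);
-err = task_info_create (task_cap, task_control, &task_info);
-err = physmem_container_create (physmem_cap, 1 /* page */, &container);
-
-/* Hand the pristine task to the filesystem; it revokes our access
-   and boots the new image as described in the steps above.  */
-err = file_exec (file_cap, task_control, container, argv, envp);
-\end{verbatim}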
-
-
-\subsection{Unix Domain Sockets}
-\label{unixdomainsockets}
-
-In the Hurd on Mach, there was a global pflocal server that provided
-unix domain sockets and pipes to all users. This will not work very
-well in the Hurd on L4, because for descriptor passing (read:
-capability passing), the unix domain socket server needs to accept
-capabilities in transit. User capabilities are often implemented by
-untrusted servers, though, and thus a global pflocal server running as
-root can not accept them.
-
-However, unix domain sockets and pipes can not be implemented locally
-in the task. An external task is needed to hold buffered data and
-capabilities in transit. In theory, a new task could be used for
-every pipe or unix domain socketpair. However, in practice, one
-server for each user would suffice and perform better.
-
-This works, because access to Unix Domain Sockets is controlled via
-the filesystem, and access to pipes is controlled via file
-descriptors, usually by inheritance. For example, if a fifo is
-installed as a passive translator in the filesystem, the first user
-accessing it will create a pipe in his pflocal server. From then on,
-an active translator must be installed in the node that redirects any
-other users to the right pflocal server implementing this fifo. This
-is asymmetrical in that the first user to access a fifo will implement
-it, and thus pay the costs for it. But it does not seem to cause any
-particular problems in implementing the POSIX semantics.
-
-The GNU C library can contact \verb|~/servers/socket/pflocal| to
-implement socketpair, or start a pflocal server for this task's
-exclusive use if that node does not exist.
-
-All these are optimizations: It should work to have one pflocal process
-for each socketpair. However, performance should be better with a
-shared pflocal server, one per user.
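-
-A sketch of how the C library might select the pflocal server; the
-lookup path is the one given above, while the stub names and the
-fallback function are invented:
-
-\begin{verbatim}
-/* Sketch only; hypothetical stubs.  */
-cap_t pflocal = file_name_lookup ("~/servers/socket/pflocal", 0, 0);
-
-if (pflocal == CAP_NULL)
-  /* No per-user server installed; start one for this task's
-     exclusive use.  */
-  pflocal = start_private_pflocal ();
-
-err = socket_create_pair (pflocal, &s0, &s1);
-\end{verbatim}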
-
-
-\subsection{Pipes}
-
-Pipes are implemented using \texttt{socketpair()}, that means as
-unnamed pair of Unix Domain Sockets. The \texttt{pflocal} server will
-support this by implementing pipe semantics on the socketpair if
-requested.
-
-\begin{comment}
- It was considered to use shared memory for the pipe implementation.
- But we are not aware of a lock-free protocol using shared memory
- with multiple readers and multiple writers. It might be possible,
- but it is not obvious if that would be faster: Pipes are normally
- used with \texttt{read()} and \texttt{write()}, so the data has to
- be copied from and to the supplied buffer. This can be done
- efficiently in L4 even across address spaces using string items. In
- the implementation using sockets, the \texttt{pflocal} server
- handles concurrent read and write accesses with mutual exclusion.
-\end{comment}
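-
-At the POSIX level nothing changes for applications; a pipe simply
-behaves as an unnamed socketpair with pipe semantics:
-
-\begin{verbatim}
-/* Portable illustration: pipe() is backed by pflocal.  */
-int fds[2];
-char buf[5];
-
-if (pipe (fds) == 0)
-  {
-    write (fds[1], "hello", 5);   /* data is buffered in pflocal */
-    read (fds[0], buf, 5);
-  }
-\end{verbatim}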
-
-
-\subsection{Filesystems}
-
-\subsubsection{Directory lookup across filesystems}
-\label{xfslookup}
-
-The Hurd has the ability to let users mount filesystems and other
-servers providing a filesystem-like interface. Such filesystem
-servers are called translators. In the Hurd on GNU Mach, the parent
-filesystem would automatically start up such translators from passive
-translator settings in the inode. It would then block until the child
-filesystem sends a message to its bootstrap port (provided by the
-parent fs) with its root directory port. This root directory port can
-then be given to any client looking up the translated node.
-
-There are several things wrong with this scheme, which becomes
-apparent in the Hurd on L4. The parent filesystem must be careful to
-not block on creating the child filesystem task. It must also be
-careful to not block on receiving any acknowledgement or startup
-message from it. Furthermore, it can not accept the root directory
-capability from the child filesystem and forward it to clients, as
-they are potentially not trusted.
-
-The latter problem can be solved the following way: The filesystem
-knows about the server thread in the child filesystem. It also
-implements an authentication capability that represents the ability to
-access the child filesystem. This capability is also given to the
-child filesystem at startup (or when it attaches itself to the parent
-filesystem). On client dir\_lookup, the parent filesystem can return
-the server\_thread and the authentication capability to the client.
-The client can use that to initiate a connection with the child
-filesystem (by first building up a connection, then sending the
-authentication capability from the parent filesystem, and receiving a
-root directory capability in exchange).
-
-\begin{comment}
- There is a race here. If the child filesystem dies and the parent
- filesystem processes the task death notification and releases the
- task info cap for the child before the user acquires its own task
- info cap for the child, then an imposter might be able to pretend to
- be the child filesystem for the client.
-
- This race can only be avoided by a more complex protocol:
-
- Variant 1: The user has to acquire the task info cap for the child
-  fs, and then it has to perform the lookup again. If the thread ID
-  returned then belongs to the task it holds the task info cap for,
-  it can go on.
- If not, it has to retry. This is not so good because a directory
- lookup is usually an expensive operation. However, it has the
- advantage of only slowing down the rare case.
-
- Variant 2: The client creates an empty reference container in the
- task server, which can then be used by the server to fill in a
- reference to the child's task ID. However, the client has to create
-  and destroy such a container for every filesystem where it expects
- it could be redirected to another (that means: for all filesystems
- for which it does not use \verb/O_NOTRANS/). This is quite an
- overhead to the common case.
-
-\begin{verbatim}
-<marcus> I have another idea
-<marcus> the client does not give a container
-<marcus> server sees child fs, no container -> returns O_NOTRANS node
-<marcus> then client sees error, uses O_NOTRANS node, "" and container
-<marcus> problem solved
-<marcus> this seems to be the optimum
-<neal> hmm.
-<neal> So lazily supply a container.
-<marcus> yeah
-<neal> Hoping you won't need one.
-<marcus> and the server helps you by doing as much as it can usefully
-<neal> And that is the normal case.
-<neal> Yeah, that seems reasonable.
-<marcus> the trick is that the server won't fail completely
-<marcus> it will give you at least the underlying node
-\end{verbatim}
-\end{comment}
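-
-The lazily supplied container from the discussion above could look
-like this on the client side (a sketch; the stub names and the error
-convention are invented):
-
-\begin{verbatim}
-/* Sketch only; hypothetical stubs.  */
-err = dir_lookup (dir, name, flags, NULL, &node);
-if (err == EAGAIN)   /* Translated node, but no container given.  */
-  {
-    /* We got the underlying (untranslated) node instead; retry on
-       it with an empty reference container for the child's task.  */
-    err = task_ref_container_create (task_cap, &container);
-    err = dir_lookup (node, "", flags, container, &node);
-  }
-\end{verbatim}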
-
-The actual creation of the child filesystem can be performed much like
-a suid exec, just without any client to follow up with further
-capabilities and startup info. The only problem that remains is how
-the parent filesystem can know which thread in the child filesystem
-implements the initial handshake protocol for the clients to use. The
-only safe way here seems to be that the parent filesystem requires the
-child to use the main thread for that, or that the parent filesystem
-creates a second thread in the child at startup (passing its thread ID
-in the startup data), requiring that this second thread is used. In
-either case the parent filesystem will know the thread ID in advance
-because it created the thread in the first place. This looks a bit
-ugly, and violates good taste, so we might try to look for alternative
-solutions.
-
-
-\subsubsection{Reparenting}
-\label{reparenting}
-
-The Hurd on Mach contains a curious RPC, \verb/file_reparent/, which
-allows you to create a new capability for the same node, with the
-difference that the new node will have a supplied capability as its
-parent node. A directory lookup of \texttt{..} on this new capability
-would return the provided parent capability.
-
-This function is used by the \texttt{chroot()} function, which sets
-the parent node to the null capability to prevent escape from a
-\texttt{chroot()} environment. It is also used by the
-\texttt{firmlink} translator, which is a cross over of a symbolic and
-a hard link: It works like a hard link, but can be used across
-filesystems.
-
-A firmlink is a dangerous thing. Because the filesystem will give no
-indication if the parent node it returns is provided by itself or some
-other, possibly untrusted filesystem, the user might follow the parent
-node to untrusted filesystems without being aware of it.
-
-In the Hurd port to L4, the filesystem can not accept untrusted parent
-capabilities on behalf of the user anymore. The \texttt{chroot()}
-function is not difficult to implement anyway, as no real capability
-is required. The server can just be instructed to create a node with
-no parent node, and it can do that without problems. Nevertheless, we
-also want a secure version of the \texttt{firmlink} translator. This
-is possible if the same strategy is used as in cross filesystem
-lookups. The client registers a server thread as the handler for the
-parent node, and the filesystem returns a capability that can be used
-for authentication purposes. Now, the client still needs to connect
-this to the new parent node. Normally, the filesystem providing the
-new parent node will also not trust the other filesystem, and thus can
-not accept the capability that should be used for authentication
-purposes. So instead of creating a direct link from the one filesystem
-to the other, the firmlink translator must act as a middle man, and
-redirect all accesses to the parent node first to itself, and then to
-the filesystem providing the parent node. For this, it must request a
-capability from that filesystem that can be used for authentication
-purposes when bootstrapping a connection, that allows such a
-bootstrapping client to access the parent node directly.
-
-This also fixes the security issues, because now any move away from
-the filesystem providing the reparented node will explicitly go first
-to the \texttt{firmlink} translator, and then to the filesystem
-providing the parent node. The user can thus make an informed
-decision if it trusts the \texttt{firmlink} translator and the
-filesystem providing the parent node.
-
-\begin{comment}
- This is a good example where the redesign of the IPC system forces
- us to fix a security issue and provides a deeper insight into the
- trust issues and how to solve them.
-\end{comment}
-
-
-\section{Debugging}
-\label{debug}
-
-L4 does not support debugging. So every task has to implement a debug
-interface and implement debugging locally. gdb needs to be changed to
-make use of this interface. How to perform the required
-authentication, how the debug thread is advertised to gdb, and what
-the debug interface should look like, are all open questions.
-
-
-\section{Device Drivers}
-
-This section was written by Peter De Schrijver and Daniel Wagner.
-
-\subsection{Requirements}
-
- \begin{itemize}
- \item Performance: Speed is important!
- \item Portability: Framework should work on different architectures.
-
-    Also: Usable in a non-Hurdish environment with only
- small changes.
-
- \item Flexibility
- \item Convenient interfaces
- \item Consistency
-  \item Safety: driver failure should have as little system impact as
- possible.
- \end{itemize}
-
-\subsection{Overview}
-
- The framework consists of:
- \begin{itemize}
- \item Bus drivers
- \item Device drivers
- \item Service servers (plugin managers, $\omega_0$, rootserver)
- \end{itemize}
-
-\subsubsection{Drivers and the filesystem}
-
- The device driver framework will only offer a physical device view.
-  I.e. it will be a tree with devices as the leaves connected by
-  various bus technologies. Any logical view and naming persistence
-  will have to be built on top of this (translator).
-
-\subsubsection{Layer of the drivers}
-
- The device driver framework consists only of the lower level drivers
- and doesn't need to have a complicated scheme for access control.
-  This is because it should be possible to share devices, e.g. with a
-  neighbour Hurd. The authentication is done by installing a virtual
-  driver in each OS/neighbour Hurd. The driver framework trusts these
- virtual drivers. So it's possible for a non Hurdish system to use
- the driver framework just by implementing these virtual drivers.
-
- Only threads which have registered as trusted are allowed to access
-  device drivers. The check is simply done by checking the sender's
- ID against a table of known threads.
-
-\subsubsection{Address spaces}
-
- Drivers always reside in their own AS. The overhead for cross AS IPC
- is small enough to do so.
-
-\subsubsection{Zero copying and DMA}
-
- It is assumed that there are no differences between physical memory
-  pages. For example, each physical memory page can be used for DMA
-  transfers. Of course, older hardware like ISA devices can thus not be
- supported. Who cares?
-
- With this assumption, the device driver framework can be given any
- physical memory page for DMA operation. This physical memory page
- must be pinned down.
-
- If an application wants to send or receive data to/from a device
- driver it has to tell the virtual driver the page on which the
- operation has to be executed. Since the application doesn't know
- the virtual-real memory mapping, it has to ask the physical memory
- manager for the real memory address of the page in question. If the
-  page is not directly mapped from the physical memory manager, the
-  application asks the mapper (another application which has mapped
-  this memory region into the first application) to resolve the
-  mapping. This can be done recursively. Normally, this resolving of
-  mappings can be sped up using a cache service, since a small number
-  of pages are reused very often.
-
-  With this scheme, the drivers do not have to take special care of
-  zero copying if there is only one virtual driver. When there is
-  more than one virtual driver, pages have to be copied for all other
- virtual drivers.
-
-\subsubsection{Root bus driver}
-
-  The root bus is the entry point to look up devices.
-
- XXX There should be iterators/visitors for operating on
- busses/devices. (daniel)
-
-
-\subsubsection{Things for the future}
-
- \begin{itemize}
-  \item Interaction with the task server (e.g. listing driver threads
-    with ps, etc.)
-  \item Power management
- \end{itemize}
-
-\subsection{Bus Drivers}
-
-A bus driver is responsible for managing the bus and providing access
-to devices connected to it. In practice this means a bus driver has
-to perform the following tasks:
-
-\begin{itemize}
-\item Handle hotplug events
-
-  Busses which do not support hotplugging will be treated as if there
-  is one insertion event for every device connected to them when the
-  bus driver is started. Drivers which don't support autoprobing of
-  devices will probably have to read some configuration data from a
-  file, or, if the driver is needed for bootstrapping, the
-  configuration can be given as an argument on its stack. In some
-  cases the bus doesn't generate insertion/removal events, but can
-  still support some form of hotplug functionality if the user tells
-  the driver when a change to the bus configuration has happened
-  (eg. SCSI).
-
-\item Configure client device drivers
-
- The bus driver should start the appropriate client device driver
- translator when an insertion event is detected. It should also
- provide the client device driver with all necessary configuration
- info, so it can access the device it needs. This configuration data
- typically consists of the bus addresses of the device and possibly
-  IRQ numbers or DMA channel IDs. The device driver is loaded by the
-  associated plugin manager.
-
-\item Provide access to devices
-
- This means the bus driver should be able to perform a bus
- transaction on behalf of a client device driver. In some cases this
-  involves sending a message and waiting for a reply (eg. SCSI, USB,
-  IEEE 1394, Fibre Channel, ...). The driver should provide
-  send/receive message primitives in this case. In other cases
-  devices on the bus can be accessed by doing memory accesses or by
- using special I/O instructions. In this case the driver should
- provide mapping and unmapping primitives so a client device driver
- can get access to the memory range or is allowed to access the I/O
- addresses. The client device driver should use a library, which is
-  bus dependent, to access the device on the bus. This library hides
- the platform specific details of accessing the bus.
-
-  Furthermore the bus driver must also support rescans for hardware.
-  It might be that not all drivers are found during bootstrapping, so
-  drivers could be loaded later on. This is done by resending attach
-  notifications to the bus's plugin manager. The plugin manager then
-  loads a new driver if possible. A probe function is not needed
-  since all supported hardware can be identified by vendor/device
-  identifications (unlike ISA hardware). For hardware busses which
-  don't support such identification (ISA), only static configuration
-  is possible (configuration scripts etc.)
-\end{itemize}
-
-
-\subsubsection{Plugin Manager}
-
- Each bus driver has a handle/reference to which insert/remove events
-  are sent. The owner of the handle/reference must then take
-  appropriate action like loading the drivers. These actors are
- called plugin managers.
-
-\subsubsection{Generic Bus Driver}
-
- Operations:
- \begin{itemize}
- \item notify (attach, detach)
- \item string enumerate
- \end{itemize}
-
- XXX Extract generic bus services from the PCI Bus Driver section
-  which could also be used by other PCI-related busses (ISA).
-  The name for this service is misleading, since a SCSI Bus Driver
- does not have anything in common with a PCI bus. (daniel)
-
-\subsubsection{ISA Bus Driver}
-Inherits from:
-
-\begin{itemize}
-\item Generic Bus Driver
-\end{itemize}
-
-Operations:
-\begin{itemize}
-\item (none)
-\end{itemize}
-
-XXX The interface has not been defined up to now. (daniel)
-
-
-\subsubsection{PCI Bus Driver}
-
-Inherits from:
-\begin{itemize}
-\item Generic Bus Driver
-\end{itemize}
-
-Operations:
-\begin{itemize}
-\item map\_mmio: map a PCI BAR for MMIO
-\item map\_io: map a PCI BAR for I/O
-\item map\_mem: map a PCI BAR for memory
-\item read\_mmio\_{8,16,32,64}: read from a MMIO register
-\item write\_mmio\_{8,16,32,64}: write to a MMIO register
-\item read\_io\_{8,16,32,64}: read from an IO register
-\item write\_io\_{8,16,32,64}: write to an IO register
-\item read\_config\_{8,16,32,?}: read from a PCI config register
-\item write\_config\_{8,16,32,?}: write to a PCI config register
-\item alloc\_dma\_mem (for non zero copying): allocate main memory usable for DMA
-\item free\_dma\_mem (for non zero copying): free main memory usable for DMA
-\item prepare\_dma\_read: write back CPU cachelines for DMAable memory area
-\item sync\_dma\_write: discard CPU cachelines for DMAable memory area
-\item alloc\_consistent\_mem: allocate memory which is consistent between CPU
- and device
-\item free\_consistent\_mem: free memory which
- is consistent between CPU and device
-\item get\_irq\_mapping (A,B,C,D): get the IRQ matching the INT(A,B,C,D) line
-\end{itemize}
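-
-In C, part of this interface might be rendered as follows. This is
-purely illustrative; the types and signatures are not settled
-anywhere in this document.
-
-\begin{verbatim}
-/* Sketch only; hypothetical types and signatures.  */
-error_t  pci_map_mmio (pci_dev_t dev, int bar, void **addr);
-uint32_t pci_read_mmio_32 (pci_dev_t dev, int bar, size_t reg);
-void     pci_write_mmio_32 (pci_dev_t dev, int bar, size_t reg,
-                            uint32_t value);
-error_t  pci_read_config_32 (pci_dev_t dev, size_t reg,
-                             uint32_t *value);
-error_t  pci_alloc_dma_mem (size_t size, void **virt, paddr_t *phys);
-error_t  pci_get_irq_mapping (pci_dev_t dev, int pin, int *irq);
-\end{verbatim}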
-
-\subsection{Device Drivers}
-\subsubsection{Classes}
-\begin{itemize}
-\item character: This is the standard tty as known in the Unix environment.
-\item block
-\item human input: Keyboard, mouse, ...
-\item packet switched network
-\item circuit switched network
-\item framebuffer
-\item streaming audio
-\item streaming video
-\item solid state storage: flash memory
-\end{itemize}
-
-\subsubsection{Human input devices (HID) and the console}
-
-The HIDs and the console are critical for user interaction with the
-system. Furthermore, the console should be working as soon as possible
-to give feedback. Log messages which are sent to the console before
-the hardware has been initialized should be buffered.
-
-\subsubsection{Generic Device Driver}
-Operations:
-\begin{itemize}
-\item init : prepare hardware for use
-\item start : start normal operation
-\item stop : stop normal operation
-\item deinit : shutdown hardware
-\item change\_irq\_peer : change the peer thread to propagate IRQ messages to.
-\end{itemize}
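-
-These operations map naturally onto a table of function pointers
-shared by all drivers, for instance (hypothetical types):
-
-\begin{verbatim}
-/* Sketch only.  */
-struct device_ops
-{
-  error_t (*init) (struct device *dev);     /* prepare hardware */
-  error_t (*start) (struct device *dev);    /* start normal operation */
-  error_t (*stop) (struct device *dev);     /* stop normal operation */
-  error_t (*deinit) (struct device *dev);   /* shutdown hardware */
-  /* Change the peer thread to propagate IRQ messages to.  */
-  error_t (*change_irq_peer) (struct device *dev,
-                              l4_thread_id_t peer);
-};
-\end{verbatim}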
-
-
-\subsubsection{ISA Devices}
-Inherits from:
-\begin{itemize}
-\item Generic Device Driver
-\end{itemize}
-
-Supported devices
-\begin{itemize}
-\item Keyboard (ps2)
-\item serial port (mainly for debugging purposes)
-\item parallel port
-\end{itemize}
-
-XXX interface definition for each device driver is missing. (daniel)
-
-
-\subsubsection{PCI Devices}
-Inherits from:
-\begin{itemize}
-\item Generic Device Driver
-\end{itemize}
-
-Supported devices:
-\begin{itemize}
-\item block devices
-\item ...
-\end{itemize}
-
-XXX interface definition for each device driver is missing. (daniel)
-
-
-\subsection{Resource Management}
-
-
-\subsubsection{IRQ handling}
-
-\paragraph{IRQ based interrupt vectors}
-
-Some CPU architectures (eg 68k, IA32) can directly jump to an
-interrupt vector depending on the IRQ number. This is typically the
-case on CISC CPUs. In this case there is some prioritization scheme.
-On IA32 for example, the lowest IRQ number has the highest priority.
-Sometimes the priorities are programmable. Most RISC CPUs have only a
-few interrupt vectors which are connected to external IRQs (typically
-1 or 2). This means the IRQ handler should read a register in the
-interrupt controller to determine which IRQ handler has to be
-executed. Sometimes the hardware assists here by providing a register
-which indicates the highest priority interrupt according to some
-(programmable) scheme.
-
-\paragraph{IRQ acknowledgement}
-
-The IRQ acknowledgement is done in two steps. First inform the
-hardware about the successful IRQ acceptance. Then inform the ISRs
-about the IRQ event.
-
-\paragraph{Edge versus level triggered IRQs}
-
-Edge triggered IRQs typically don't need explicit acknowledgment by
-the CPU at the device level. You can just acknowledge them at the
-interrupt controller level. Level triggered IRQs typically need to be
-explicitly acknowledged by the CPU at the device level. The CPU has to
-read or write a register from the IRQ generating peripheral to make
-the IRQ go away. If this is not done, the IRQ handler will be
-reentered immediately after it ended, effectively creating an endless
-loop. Another way of preventing this would be to mask the IRQ.
-
-\paragraph{Multiple interrupt controllers}
-
-Some systems have multiple interrupt controllers in cascade. This is
-for example the case on a PC, where you have 2 8259 interrupt
-controllers. The second controller is connected to the IRQ 2 pin of
-the first controller. It is also common in non PC systems which still
-use some standard PC components such as a Super IO controller. In this
-case the 2 8259's are connected to 1 pin of the primary interrupt
-controller. Important for the software here is that you need to
-acknowledge IRQ's at each controller. So to acknowledge an IRQ from
-the second 8259 connected to the first 8259 connected to another
-interrupt controller, you have to give an ACK command to each of those
-controllers. Another important fact is that on the PC architecture the order
-of the ACKs is important.
-
-\paragraph{Shared IRQs}
-
-Some systems have shared IRQs. In this case the IRQ handler has to
-look at all devices using the same IRQ...
-
-\paragraph{IRQ priorities}
-
-All IRQs on L4 have priorities, so if an IRQ occurs any IRQ lower than
-the first IRQ will be blocked until the first IRQ has been
-acknowledged. ISR priorities must match the hardware priority (danger
-of priority inversion). Furthermore the IRQ acknowledgment order is
-important.
-
-The 8259 also supports a specific IRQ acknowledge iirc. But, this
-scheme does not work in most level triggered IRQ environments. In
-these environments you must acknowledge (or mask) the IRQ before
-leaving the IRQ handler, otherwise the CPU will immediately reenter
-the IRQ handler, effectively creating an endless loop. In this case L4
-would have to mask the IRQ. The IRQ thread would have to unmask it
-after acknowledgement and processing.
-
-\paragraph{IRQ handling by L4/x86}
-
-The L4 kernel does handle IRQ acknowledgment.
-
-
-\subsubsection{$\omega_0$}
-
-$\omega_0$ is a system-central IRQ-logic server. It runs in the
-privileged address space in order to be allowed to reroute IRQ IPC.
-
-If an IRQ is shared between several devices, the drivers are daisy
-chained and have to notify their peers if an IRQ IPC has arrived.
-
-XXX For more detail see XXX URL missing
-
-Operations:
-\begin{itemize}
-\item attach\_irq : attach an ISR thread to the IRQ
-\item detach\_irq : detach an ISR thread from the IRQ
-\end{itemize}
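-
-An ISR thread would then follow a loop of roughly this shape. This
-is a sketch: the $\omega_0$ stubs are invented, and the L4
-convenience calls are assumed to follow their spec counterparts.
-
-\begin{verbatim}
-/* Sketch only; hypothetical omega0 stubs.  */
-omega0_attach_irq (omega0_cap, irq, l4_myself ());
-
-while (1)
-  {
-    l4_thread_id_t from;
-    l4_wait (&from);              /* IRQ arrives as an IPC message.  */
-    device_handle_irq (dev);      /* Device-level acknowledgement.   */
-    omega0_ack_irq (omega0_cap, irq);  /* Unmask/ack the IRQ.        */
-    if (irq_shared)
-      notify_peer (peer);         /* Daisy chain to the next ISR.    */
-  }
-\end{verbatim}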
-
-
-\subsubsection{Memory}
-If no physical memory pages are provided by the OS, the device driver
-framework allocates pages from the physical memory manager. At no
-point in time does the device driver framework have to handle any
-virtual to physical page mapping.
-
-
-\subsection{Bootstrapping}
-
-A simpleFS provides initial drivers for bootstrapping. The root bus
-driver and the simpleFS are loaded by grub as modules. It then
-signals for loading new (bus) drivers. As before, if there is no
-driver available for the device for some reason, the bus driver
-doesn't change the device state and waits for a notification that
-there are new drivers available. This simpleFS might be based on BSD
-libstand (a library for standalone applications). simpleFS doesn't
-need to be writeable either.
-
-
-\subsubsection{Plugin Manager}
-A Plugin manager handles driver loading for devices. It searches for
-drivers in search paths (on filesystems). It is possible to add new
-search paths later. This allows the system to bootstrap with only
-one search path (the simpleFS). When the search path is changed, the
-device tree will be scanned for devices which don't have a driver
-loaded yet. If a driver has become available, it will be loaded.
-
-
-\subsection{Order of implementation}
-
-\begin{enumerate}
-\item rootserver, plugin server
-\item root bus server
-\item pci bus
-\item isa bus
-\item serial port (isa bus)
-\item console
-\end{enumerate}
-
+\include{introduction}
+\include{booting}
+\include{ipc}
+\include{threads-tasks}
+\include{vmm}
+\include{authentication}
+\include{posix}
+\include{debugging}
+\include{device-drivers}
\end{document}
diff --git a/doc/introduction.tex b/doc/introduction.tex
new file mode 100644
index 0000000..57a28f8
--- /dev/null
+++ b/doc/introduction.tex
@@ -0,0 +1,44 @@
+\chapter{Introduction}
+
+The GNU Hurd is a multi-server operating system running on top of a
+microkernel (currently Mach variants). The core motivation of the
+Hurd is the following:
+
+\begin{quote}
+ \emph{The operating system should enable its users to share the
+ resources of the system without harming each other.}
+\end{quote}
+
+The focus is on the user: the system should try to allow the user to
+do anything that is not harmful to other users. Many operating
+systems restrict what the user can do to be more secure, while
+others allow the user to do everything, but fail to protect the
+users from each other effectively.
+
+The Hurd is designed to minimize the system code that the user is
+required to use, while allowing the user to use, ignore or replace the
+remaining system code, and this without harming other users.
+
+So while the L4 microkernel tries to minimize the policy that the
+kernel enforces on the software running on it, the Hurd tries to
+minimize the policy that the operating system enforces on its users.
+Furthermore, the Hurd also aims to provide a POSIX compatible general
+purpose operating system. However, this POSIX personality of the Hurd
+is provided for convenience only, and to make the Hurd useful. Other
+personalities can be implemented and used by the users of the system
+along with the POSIX personality. This default personality of the
+Hurd also provides some convenient features that allow the user to
+extend the system so that all POSIX compatible programs can take
+advantage of it.
+
+These notes are a moving target in the effort to find the best
+strategy to port the Hurd to the L4 microkernel.
+
+\begin{comment}
+ Remarks about the history of a certain feature and implementation
+ details are set in a smaller font and separated from the main text,
+ just like this paragraph. Because this is work in progress, there
+ are naturally a lot of such comments.
+\end{comment}
+
+
diff --git a/doc/ipc.tex b/doc/ipc.tex
new file mode 100644
index 0000000..522faf5
--- /dev/null
+++ b/doc/ipc.tex
@@ -0,0 +1,1126 @@
+\chapter{Inter-process communication (IPC)}
+\label{ipc}
+
+The Hurd requires a capability system. Capabilities are used to prove
+your identity to other servers (authentication), and access
+server-side implemented objects like devices, files, directories,
+terminals, and other things. The server can use a capability for
+whatever it wants. Capabilities provide interfaces. Interfaces can
+be invoked by sending messages to the capability. In L4, this means
+that a message is sent to a thread in the server providing the
+capability, with the identifier for the capability in the message.
+
+Capabilities are protected objects. Access to a capability needs to
+be granted by the server. Once you have a capability, you can copy it
+to other tasks (if the server permits it, which is usually the case).
+In the Hurd, access to capabilities is always granted to a whole task,
+not to individual threads.
+
+\begin{comment}
+ There is no reason for the server not to permit it, because the
+ holder of the capability could also just act as a proxy for the
+  intended receiver instead of copying the capability to it. The
+ operation might fail anyway, for example because of resource
+ shortage, in particular if the server puts a quota on the number of
+ capabilities a user can hold.
+\end{comment}
+
+Capabilities provide two essential services to the Hurd. They are
+used to restrict access to a server function, and they are the
+standard interface the components in the Hurd use to communicate with
+each others. Thus, it is important that their implementation is fast
+and secure.
+
+\begin{comment}
+ There are several ways to implement such a capability system. A
+ more traditional design would be a global, trusted capability server
+ that provides capabilities to all its users. The L4 redirector
+ could be used to reroute all client traffic automatically through
+ this server. This approach has several disadvantages:
+
+ \begin{itemize}
+ \item It adds a lot of overhead to every single RPC, because all
+ traffic has to be routed through the capability server, which must
+ then perform the authentication on the server's behalf.
+ \item It would be difficult to copy a capability to another task.
+ Either the cap server would have to provide interfaces for clients
+    to do it, or it would have to know the message format for every
+ interface and do it automatically.
+ \item It would be a single point of failure. If it had a bug and
+ crashed, the whole system would be affected.
+ \item Users could not avoid it, it would be enforced system code.
+ \item It is inflexible. It would be hard to replace or extend at
+ run-time.
+ \end{itemize}
+
+ Another approach is taken by CORBA with IORs. IORs contain long
+ random numbers which allow the server to identify a user of an
+ object. This approach is not feasible for the following reasons:
+
+ \begin{itemize}
+ \item Even good random numbers can be guessed. Long enough random
+    numbers can reduce the likelihood to arbitrarily small numbers,
+ though (below the probability of a hardware failure).
+  \item Good random numbers are in short supply, and are slow to
+    generate. Good pseudo random numbers are faster, but still
+    difficult to generate. The random number generator would become a
+    critical part of the operating system.
+  \item The random number would have to be transferred in every single
+ message. Because it would have to be long, it would have a
+ significant negative impact on IPC performance.
+ \end{itemize}
+\end{comment}
+
+The Hurd implements the capability system locally in each task. A
+common default implementation will be shared by all programs.
+However, a malicious untrusted program can do nothing to disturb the
+communication of other tasks. A capability is identified in the
+client task by the server thread and a local identifier (which can be
+different from client to client). The server thread will receive
+messages for the capabilities. The first argument in the message is
+the capability identifier. Although every task can get different IDs
+for the same capability, a well-behaving server will give the same ID
+to a client which already has a capability and gets the same
+capability from another client. So clients can compare capability IDs
+from the server numerically to check if two capabilities are the same,
+but only if one of the two IDs is received while the client already
+had the other one.
+
+Because access to a capability must be restricted, the server needs to
+be careful in only allowing registered and known users to access the
+capability. For this, the server must be sure that it can determine
+the sender of a message. In L4, this is easy on the surface: The
+kernel provides the receiving thread with the sender's thread ID,
+which also contains the task ID in the version field. However, the
+server must also know for sure if this task is the same task that it
+gave access to the capability. Comparing the task IDs numerically is
+not good enough, the server must also somehow have knowledge or
+influence on how task IDs are reused when tasks die and are created.
+
+The same is true for the client, of course, which trusts the server
+and thus must be sure that it is not tricked into trusting on
+unreliable data from an imposter, or sends sensitive data to it.
+
+\begin{comment}
+ The \texttt{task} server wants to reuse thread numbers because that
+ makes best use of kernel memory. Reusing task IDs, the version
+ field of a thread ID, is not so important, but there are only 14
+ bits for the version field (and the lower six bits must not be all
+ zero). So a thread ID is bound to be reused eventually.
+
+ Using the version field in a thread ID as a generation number is not
+ good enough, because it is so small. Even on 64-bit architectures,
+ where it is 32 bit long, it can eventually overflow.
+\end{comment}
+
+The best way to prevent a task from being tricked into talking to an
+imposter is to have the \texttt{task} server notify the task if the
+communication partner dies. The \texttt{task} server must guarantee
+that the task ID is not reused until all tasks that got such a
+notification acknowledge that it is processed, and thus no danger of
+confusion exists anymore.
+
+The \texttt{task} server provides references to task IDs in the form
+of \emph{task info capabilities}. If a task has a task info
+capability for another task, it prevents this other task's task ID
+from being reused even if that task dies, and it also makes sure that
+task death
+notifications are delivered in that case.
+
+\begin{comment}
+ Because only the \texttt{task} server can create and destroy tasks,
+ and assign task IDs, there is no need to hold such task info
+ capabilities for the \texttt{task} server, nor does the
+ \texttt{task} server need to hold task info capabilities for its
+ clients. This avoids the obvious bootstrap problem in providing
+ capabilities in the \texttt{task} server. This will even work if
+ the \texttt{task} server is not the real \texttt{task} server, but a
+ proxy task server (see section \ref{proxytaskserver} on page
+ \pageref{proxytaskserver}).
+\end{comment}
+
+As task IDs are a global resource, care has to be taken that this
+approach does not allow for a DoS-attack by exhausting the task ID
+number space, see section \ref{taskinfocap} on page
+\pageref{taskinfocap} for more details.
+
+
+\section{Capabilities}
+
+This subsection contains implementation details about capabilities.
+
+A server will usually operate on objects, and not capabilities. In
+the case of a filesystem, this could be file objects, for example.
+
+\begin{comment}
+ In the Hurd, filesystem servers have to keep different objects for
+ each time a file is looked up (or ``opened''), because some state,
+ for example authentication, open flags and record locks, are
+ associated not with the file directly, but with this instance of
+ opening the file. Such a state structure (``credential'') will also
+ contain a pointer and reference to the actual file node. For
+ simplicity, we will assume that the capability is associated with a
+ file node directly.
+\end{comment}
+
+To provide access to the object to another task, the server creates a
+capability, and associates it with the object (by setting a hook
+variable in the capability). From this capability, the server can
+either create send references to itself, or to other tasks. If the
+server creates send references for itself, it can use the capability
+just as it can use capabilities implemented by other servers. This
+makes access to locally and remotely implemented capabilities
+identical. If you write code to work on capabilities, it can be used
+for remote objects as well as for local objects.
+
+If the server creates a send reference for another task (a client), a
+new capability ID will be created for this task. This ID will only be
+valid for this task, and should be returned to the client.
+
+The client itself will create a capability object from this capability
+ID. The capability will also contain information about the server,
+for example the server thread which should be used for sending
+messages to the capability.
+
+If the client wants to send a message, it will send it to the provided
+server thread, and use the capability ID it got from the server as the
+first argument in the RPC. The server receives the message, and now
+has to look up the capability ID in the list of capabilities for this
+task.
+
+\begin{comment}
+ The server knows the task ID from the version field of the sender's
+ thread ID. It can look up the list of capabilities for this task in
+ a hash table. The capability ID can be an index into an array, so
+  the server only needs to perform a range check. This makes it quick
+  to verify that the user is allowed to access the object.
+
+ This is not enough if several systems run in parallel on the same
+ host. Then the version ID for the threads in the other systems will
+ not be under the control of the Hurd's \texttt{task} server, and can
+ thus not be trusted. The server can still use the version field to
+ find out the task ID, which will be correct \emph{if the thread is
+ part of the same subsystem}. It also has to verify that the
+ thread belongs to this subsystem. Hopefully the subsystem will be
+ encoded in the thread ID. Otherwise, the \texttt{task} server has
+ to be consulted (and, assuming that thread numbers are not shared by
+ the different systems, the result can be cached).
+\end{comment}
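+
+The comment above suggests a concrete lookup strategy. The following
+C fragment is a minimal sketch of it; all type and function names are
+illustrative assumptions (a hash table such as the one provided by
+\texttt{libhurd-ihash} could serve as the per-task index):
+
+\begin{verbatim}
+#include <stdlib.h>
+
+typedef unsigned int task_id_t;
+typedef unsigned int cap_id_t;
+
+struct cap
+{
+  void *hook;            /* The server object, e.g. a file node.  */
+  /* Reference count, etc.  */
+};
+
+struct client_caps
+{
+  struct cap **caps;     /* Array of capabilities, indexed by ID.  */
+  size_t nr_caps;        /* Length of the CAPS array.  */
+};
+
+/* Assumed to exist: find the capability list of TASK, for example
+   via a hash table keyed by the task ID.  */
+extern struct client_caps *server_client_lookup (task_id_t task);
+
+/* Return the capability CAP_ID of task TASK, or NULL if the task
+   is unknown or CAP_ID is out of range or unused.  */
+struct cap *
+server_cap_lookup (task_id_t task, cap_id_t cap_id)
+{
+  struct client_caps *client = server_client_lookup (task);
+
+  if (! client || cap_id >= client->nr_caps)
+    return NULL;
+  return client->caps[cap_id];
+}
+\end{verbatim}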
+
+The server reads out the capability associated with the capability ID,
+and invokes the server stub according to the message ID field in the
+message.
+
+After the message is processed, the server sends its reply to the
+sender thread with a zero timeout.
+
+\begin{comment}
+ Servers must never block on sending messages to clients. Even a
+  small timeout can be used for DoS attacks. The client can always
+ make sure that it receives the reply by using a combined send and
+ receive operation together with an infinite timeout.
+\end{comment}
+
+The above scheme assumes that the server and the client already have
+task info caps for each other. This is the normal
+case, because acquiring these task info caps is part of the protocol
+that is used when a capability is copied from one task to another.
+
+
+\subsection{Bootstrapping a client-server connection}
+\label{ipcbootstrap}
+
+If the client and the server do not know about each other yet, then
+they can bootstrap a connection without support from any other task
+except the \texttt{task} server. The purpose of the initial handshake
+is to give both participants a chance to acquire a task info cap for
+the other participant's task ID, so they can be sure that from there on
+they will always talk to the same task as they talked to before.
+
+\subsubsection{Preconditions}
+The client knows the thread ID of the server thread that receives and
+processes the bootstrap messages. Some other task might hold a task
+info capability to the server the client wants to connect to.
+
+\begin{comment}
+  If no such other task exists, the protocol will still work.
+  However, the client might not get a connection to the server that
+  was running at the time the client started the protocol, but rather
+  to the server that was running at the time the client acquired the
+  task info cap
+ for the server's task ID (after step 1 below).
+
+ This is similar to how sending signals works in Unix: Technically,
+  at the time you type \texttt{kill 203} and press enter, you do not
+  know whether the process with PID 203 you thought of will receive
+  the signal, or some other process that got that PID in the time
+  between your learning of the PID and running the
+  \texttt{kill} command.
+\end{comment}
+
+FIXME: Here should be the pseudo code for the protocol. For now, you
+have to take it out of the long version.
+
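+Until that pseudo code is merged, the following C sketch of the
+client side may serve as an illustration. Every type and RPC name in
+it is a placeholder, not a settled interface; the numbered comments
+refer to the steps below.
+
+\begin{verbatim}
+typedef int error_t;
+typedef unsigned int task_id_t;
+typedef unsigned int cap_id_t;
+
+/* Placeholder stubs for the RPCs used in the sketch.  */
+extern error_t task_info_cap_acquire (task_id_t task);
+extern void task_info_cap_release (task_id_t task);
+extern error_t handshake_request (task_id_t server);
+extern error_t initial_cap_request (task_id_t server, cap_id_t *cap);
+
+error_t
+client_bootstrap (task_id_t server, cap_id_t *initial_cap)
+{
+  /* Step 1: acquire a task info cap for the server's task ID.  */
+  error_t err = task_info_cap_acquire (server);
+  if (err)
+    return err;
+
+  /* Steps 2 and 5: empty initial handshake request and reply.  */
+  err = handshake_request (server);
+
+  /* Steps 6 and 7: request the initial capability; any required
+     authentication capabilities would be passed here.  */
+  if (! err)
+    err = initial_cap_request (server, initial_cap);
+
+  if (err)
+    task_info_cap_release (server);
+  return err;
+}
+\end{verbatim}
+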
+\begin{enumerate}
+
+\item The client acquires a task info capability for the server's task
+ ID, either directly from the \texttt{task} server, or from another
+ task in a capability copy. From that point on, the client can be
+ sure to always talk to the same task when talking to the server.
+
+ Of course, if the client already has a task info cap for the server
+ it does not need to do anything in this step.
+
+\begin{comment}
+ As explained above, if the client does not have any other task
+ holding the task info cap already, it has no secure information
+ about what this task is for which it got a task info cap.
+\end{comment}
+
+\item The client sends a message to the server, requesting the initial
+ handshake.
+
+\item The server receives the message, and acquires a task info cap
+ for the client task (directly from the \texttt{task} server).
+
+ Of course, if the server already has a task info cap for the client
+ it does not need to do anything in this step.
+
+\begin{comment}
+ At this point, the server knows that future messages from this task
+ will come from the same task as it got the task info cap for.
+ However, it does not know that this is the same task that sent the
+ initial handshake request in step 2 above. This shows that there is
+  no sense in verifying the task ID or performing any other
+ authentication before acquiring the task info cap.
+\end{comment}
+
+\item The server replies to the initial handshake request with an
+ empty reply message.
+
+\begin{comment}
+  Because the reply can now go to a different task than the request
+ came from, sending the reply might fail. It might also succeed and
+ be accepted by the task that replaced the requestor. Or it might
+ succeed normally. The important thing is that it does not matter to
+ the server at all. It would have provided the same ``service'' to
+  the ``imposter'' of the client, if it had bothered to make the
+  request. As no authentication is done yet, there is no point for
+ the server to bother.
+
+  This means, however, that the server needs to be careful not to
+  consume too many resources for this service. Still, this is
+ easy to achieve. Only one task info cap per client task will ever
+ be held in the server. The server can either keep it around until
+ the task dies (and a task death notification is received), or it can
+ clean it up after some timeout if the client does not follow up and
+ do some real authentication.
+\end{comment}
+
+\item The client receives the reply message to its initial handshake
+ request.
+
+\item The client sends a request to create its initial capability.
+ How this request looks depends on the type of the server and the
+ initial capabilities it provides. Here are some examples:
+
+ \begin{itemize}
+ \item A filesystem might provide an unauthenticated root directory
+    object in return for the underlying node capability, which is
+ provided by the parent filesystem and proves to the filesystem
+ that the user was allowed to look up the root node of this
+ filesystem (see section \ref{xfslookup} on page
+ \pageref{xfslookup}).
+
+ \begin{comment}
+ In this example, the parent filesystem will either provide the
+ task info cap for the child filesystem to the user, or it will
+ hold the task info cap while the user is creating their own
+ (which the user has to verify by repeating the lookup, though).
+ Again, see section \ref{xfslookup} on page \pageref{xfslookup}.
+
+      The unauthenticated root directory object will then have to be
+      authenticated using the normal reauthentication mechanism (see
+      section \ref{auth} on page \pageref{auth}). This can also be combined
+ in a single RPC.
+ \end{comment}
+
+ \item Every process acts as a server that implements the signal
+    capability for this process. Tasks that want to send a signal to
+ another task can perform the above handshake, and then provide
+ some type of authentication capability that indicates that they
+ are allowed to send a signal. Different authentication
+ capabilities can be accepted by the signalled task for different
+ types of signals.
+
+ \begin{comment}
+      The Hurd used to store the signal capability in the \texttt{proc} server,
+ where authorized tasks could look it up. This is no longer
+ possible because a server can not accept capabilities
+ implemented by untrusted tasks, see below.
+ \end{comment}
+ \end{itemize}
+
+\item The server replies with whatever capability the client
+ requested, provided that the client could provide the necessary
+ authentication capabilities, if any.
+
+ \begin{comment}
+ It is not required that the server performs any authentication at
+ all, but it is recommended, and all Hurd servers will do so.
+
+ In particular, the server should normally only allow access from
+ tasks running in the same system, if running multiple systems on
+ the same host is possible.
+ \end{comment}
+\end{enumerate}
+
+\subsubsection{Result}
+The client has a task info capability for the server and an
+authenticated capability. The server has a task info capability for
+the client and has seen some sort of authentication for the capability it
+gave to the client.
+
+\begin{comment}
+ If you think that the above protocol is complex, you have seen
+ nothing yet! Read on.
+\end{comment}
+
+
+\subsection{Returning a capability from a server to a client}
+
+Before we go on to the more complex case of copying a capability from
+one client to another, let us point out that once a client has a
+capability from a server, it is easy for the server to return more
+capabilities it implements to the client.
+
+The server just needs to create the capability, acquire a capability
+ID in the client's cap ID space, and return the information in the
+reply RPC.
+
+FIXME: Here should be the pseudo code for the protocol. For now, you
+have to take it out of the long version.
+
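+As a stop-gap, the server side can be sketched as follows; the helper
+names are hypothetical and not part of any defined interface:
+
+\begin{verbatim}
+#include <errno.h>
+
+typedef int error_t;
+typedef unsigned int task_id_t;
+typedef unsigned int cap_id_t;
+struct cap;
+
+extern struct cap *cap_create (void *object);
+extern error_t cap_alloc_id (task_id_t client, struct cap *cap,
+                             cap_id_t *id);
+
+/* Called from an RPC server stub.  Create a capability for OBJECT,
+   allocate an ID for it in CLIENT's capability ID space, and store
+   it in *ID; the reply message then carries *ID back to the
+   client.  */
+error_t
+server_return_cap (task_id_t client, void *object, cap_id_t *id)
+{
+  struct cap *cap = cap_create (object);
+  if (! cap)
+    return ENOMEM;
+  return cap_alloc_id (client, cap, id);
+}
+\end{verbatim}
+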
+\begin{comment}
+ The main point of this section is to point out that only one task
+ info capability is required to protect all capabilities provided to
+ a single task. The protocols described here always assume that no
+ task info caps are held by anyone (except those mentioned in the
+ preconditions). In reality, sometimes the required task info caps
+ will already be held.
+\end{comment}
+
+
+\subsection{Copying a capability from one client to another task}
+
+The most complex operation in managing capabilities is to copy or move
+a capability from the client to another task, which subsequently
+becomes a client of the server providing the capability. The
+difficulty here lies in the fact that the protocol should be fast, but
+also robust and secure. If any of the participants dies unexpectedly,
+or any of the untrusted participants is malicious, the others should
+not be harmed.
+
+\subsubsection{Preconditions}
+The client $C$ has a capability from server $S$ (this implies that $C$
+has a task info cap for $S$ and $S$ has a task info cap for $C$). It
+wants to copy the capability to the destination task $D$. For this,
+it will have to make RPCs to $D$, so $C$ also has a capability from
+$D$ (this implies that $C$ has a task info cap for $D$ and $D$ has a
+task info cap for $C$). Of course, the client $C$ trusts its servers
+$S$ and $D$. $D$ might trust $S$ or not, and thus accept or reject
+the capability that $C$ wants to give to $D$. $S$ does not trust
+either $C$ or $D$.
+
+The \texttt{task} server is also involved, because it provides the
+task info capabilities. Everyone trusts the \texttt{task} server they
+use. This does not need to be the same one for every participant.
+
+FIXME: Here should be the pseudo code for the protocol. For now, you
+have to take it out of the long version.
+
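+In the meantime, here is a C sketch of the protocol as driven by the
+client $C$. The \verb/cap_ref_cont_create/ and
+\verb/cap_ref_cont_destroy/ names are the ones used below; everything
+else is a placeholder.
+
+\begin{verbatim}
+typedef int error_t;
+typedef unsigned int task_id_t;
+typedef unsigned int cap_id_t;
+typedef unsigned int ref_cont_id_t;
+typedef unsigned long thread_id_t;
+
+extern error_t cap_ref_cont_create (cap_id_t cap, task_id_t dest,
+                                    ref_cont_id_t *r);
+extern void cap_ref_cont_destroy (cap_id_t cap, ref_cont_id_t r);
+extern error_t send_cap_to_dest (task_id_t dest, thread_id_t server,
+                                 ref_cont_id_t r);
+
+/* Copy CAP, implemented by the server behind SERVER_THREAD, to the
+   destination task DEST.  */
+error_t
+copy_cap (cap_id_t cap, thread_id_t server_thread, task_id_t dest)
+{
+  ref_cont_id_t r;
+
+  /* Steps 1 to 3: the server creates a reference container.  */
+  error_t err = cap_ref_cont_create (cap, dest, &r);
+  if (err)
+    return err;
+
+  /* Steps 4 to 8: tell DEST where to pick up the reference; DEST
+     accepts it directly from the server (cap_ref_cont_accept).  */
+  err = send_cap_to_dest (dest, server_thread, r);
+
+  /* Step 9: always destroy the container, whether DEST accepted
+     the reference or not.  */
+  cap_ref_cont_destroy (cap, r);
+  return err;
+}
+\end{verbatim}
+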
+\begin{enumerate}
+\item The client invokes the \verb/cap_ref_cont_create/ RPC on the
+ capability, providing the task ID of the intended receiver $D$ of
+ the capability.
+
+\item The server receives the \verb/cap_ref_cont_create/ RPC from the
+ client. It requests a task info cap for $D$ from its trusted task
+ server, under the constraint that $C$ is still living.
+
+ \begin{comment}
+ A task can provide a constraint when creating a task info cap in
+ the \texttt{task} server. The constraint is a task ID. The task
+ server will only create the task info cap and return it if the
+    task with the constraint task ID is not destroyed. This allows
+    a task requesting a task info capability to make sure that
+ another task, which also holds this task info cap, is not
+ destroyed. This is important, because if a task is destroyed, all
+ the task info caps it held are released.
+
+ In this case, the server relies on the client to hold a task info
+    cap for $D$ until it has established its own. See below for what
+    can go wrong if the server did not provide a constraint and both
+    the client and the destination task died unexpectedly.
+ \end{comment}
+
+  Now that the server has established its own task info cap for $D$,
+  it creates a reference container for $D$, which has the following
+  properties (a data structure sketch follows this enumeration):
+
+ \begin{itemize}
+ \item The reference container has a single new reference for the
+ capability.
+
+ \item The reference container has an ID that is unique among all
+ reference container IDs for the client $C$.
+
+ \item The reference container is associated with the client $C$. If
+ $C$ dies, and the server processes the task death notification for
+ it, the server will destroy the reference container and release
+ the capability reference it has (if any). All resources
+ associated with the reference container will be released. If this
+ reference container was the only reason for $S$ to hold the task
+ info cap for $D$, the server will also release the task info cap
+ for $D$.
+
+ \item The reference container is also associated with the
+ destination task $D$. If $D$ dies, and the server processes the
+ task death notification for it, the server will release the
+ capability reference that is in the reference container (if any).
+ It will not destroy the part of the container that is associated
+ with $C$.
+ \end{itemize}
+
+ The server returns the reference container ID $R$ to the client.
+
+\item The client receives the reference container ID $R$.
+
+ \begin{comment}
+ If several capabilities have to be copied in one message, the
+ above steps need to be repeated for each capability. With
+ appropriate interfaces, capabilities could be collected so that
+ only one call per server has to be made. We are assuming here
+ that only one capability is copied.
+ \end{comment}
+
+\item The client sends the server thread ID $T$ and the reference
+ container ID $R$ to the destination task $D$.
+
+\item The destination task $D$ receives the server thread ID $T$ and
+ the reference container ID $R$ from $C$.
+
+ It now inspects the server thread ID $T$, and in particular the task
+  ID component of it. $D$ has to decide whether or not it trusts
+  this task to be a server for it.
+
+ If $D$ trusts $C$, it might decide to always trust $T$, too,
+  regardless of which task contains $T$.
+
+ If $D$ does not trust $C$, it might be more picky about the task
+ that contains $T$. This is because $D$ will have to become a client
+  of $T$, so it has to trust it. For example, it will block on messages
+ it sends to $T$.
+
+ \begin{comment}
+ If $D$ is a server, it will usually only accept capabilities from
+ its client that are provided by specific other servers it trusts.
+ This can be the authentication server, for example (see section
+ \ref{auth} on page \pageref{auth}).
+
+ Usually, the type of capability that $D$ wants to accept from $C$
+ is then further restricted, and only one possible trusted server
+    implements that type of capability. Thus, $D$ can simply
+ compare the task ID of $T$ with the task ID of its trusted server
+ (authentication server, ...) to make the decision if it wants to
+ accept the capability or not.
+ \end{comment}
+
+ If $D$ does not trust $T$, it replies to $C$ (probably with an error
+ value indicating why the capability was not accepted). In that
+ case, jump to step \ref{copycapout}.
+
+ Otherwise, it requests a task info cap for $S$ from its trusted task
+ server, under the constraint that $C$ is still living.
+
+ Then $D$ sends a \verb/cap_ref_cont_accept/ RPC to the server $S$,
+ providing the task ID of the client $C$ and the reference container
+ ID $R$.
+
+\begin{comment}
+ \verb/cap_ref_cont_accept/ is one of the few interfaces that is not
+ sent to a (real) capability, of course. Nevertheless, it is part of
+ the capability object interface, hence the name. You can think of
+  it as a static member of the capability class that does not require
+ an instance of the class.
+\end{comment}
+
+\item The server receives the \verb/cap_ref_cont_accept/ RPC from the
+ destination task $D$. It verifies that a reference container exists
+  with the ID $R$ that is associated with both $D$ and $C$.
+
+ \begin{comment}
+ The server will store the reference container in data structures
+ associated with $C$, under an ID that is unique but local to $C$.
+    So $D$ needs to provide both pieces of information: the task ID
+    of $C$ and the reference container ID.
+ \end{comment}
+
+ If that is the case, it takes the reference from the reference
+ container, and creates a capability ID for $D$ from it. The
+ capability ID for $D$ is returned in the reply message.
+
+ From that moment on, the reference container is deassociated from
+ $D$. It is still associated with $C$, but it does not contain any
+ reference for the capability.
+
+ \begin{comment}
+ It is not deassociated from $C$ and removed completely, so that
+ its ID $R$ (or at least the part of it that is used for $C$) is
+    not reused. $C$ must explicitly destroy the reference container
+    anyway because $D$ might die unexpectedly or return an error that
+    gives no indication of whether it accepted the reference or not.
+ \end{comment}
+
+\item The destination task $D$ receives the capability ID and enters
+ it into its capability system. It sends a reply message to $C$.
+
+ \begin{comment}
+ If the only purpose of the RPC was to copy the capability, the
+ reply message can be empty. Usually, capabilities will be
+ transfered as part of a larger operation, though, and more work
+ will be done by $D$ before returning to $C$.
+ \end{comment}
+
+\item \label{copycapout} The client $C$ receives the reply from $D$.
+  Regardless of whether it indicated failure or success, it will now send
+ the \verb/cap_ref_cont_destroy/ message to the server $S$, providing
+ the reference container $R$.
+
+ \begin{comment}
+ This message can be a simple message. It does not require a reply
+ from the server.
+ \end{comment}
+
+\item The server receives the \verb/cap_ref_cont_destroy/ message and
+ removes the reference container $R$. The reference container is
+ deassociated from $C$ and $D$. If this was the only reason that $S$
+ held a task info cap for $D$, this task info cap is also released.
+
+ \begin{comment}
+ Because the reference container can not be deassociated from $C$
+ by any other means than this interface, the client does not need
+ to provide $D$. $R$ can not be reused without the client $C$
+    first having destroyed it. This is different from the
+ \verb/cap_ref_cont_accept/ call made by $D$, see above.
+ \end{comment}
+
+\end{enumerate}
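+
+The container properties listed in step 2 suggest a server-side data
+structure roughly like the following sketch; it is illustrative only,
+not a definitive layout:
+
+\begin{verbatim}
+typedef unsigned int task_id_t;
+typedef unsigned int ref_cont_id_t;
+struct cap;
+
+struct ref_container
+{
+  ref_cont_id_t id;   /* Unique among C's containers.  */
+  task_id_t client;   /* C: if C dies, the container and any
+                         reference in it are destroyed.  */
+  task_id_t dest;     /* D: if D dies, only the reference is
+                         released; the container stays with C so
+                         that its ID is not reused.  */
+  struct cap *ref;    /* The single capability reference, or NULL
+                         once D has accepted it.  */
+};
+\end{verbatim}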
+
+\subsubsection{Result}
+For the client $C$, nothing has changed. Either the destination task
+$D$ did not accept the capability, and nothing has changed for it or
+for the server $S$; or $D$ accepted the capability, and it now has a
+task info cap for $S$ and a reference to the capability provided by
+$S$. In the latter case, the server $S$ has a task info cap for $D$
+and provides a capability ID for this task.
+
+The above protocol is for copying a capability from $C$ to $D$. If
+the goal was to move the capability, then $C$ can now release its
+reference to it.
+
+\begin{comment}
+  Originally we considered moving capabilities by default, and
+  requiring the client to acquire an additional reference if it
+  wanted to copy the capability instead. However, it turned out that
+  for the
+ implementation, copying is easier to handle. One reason is that the
+ client usually will use local reference counting for the
+ capabilities it holds, and with local reference counting, one
+ server-side reference is shared by many local references. In that
+ case, you would need to acquire a new server-side reference even if
+ you want to move the capability. The other reason is cancellation.
+ If an RPC is cancelled, and you want to back out of it, you need to
+ restore the original situation. And that is easier if you do not
+ change the original situation in the first place until the natural
+ ``point of no return''.
+\end{comment}
+
+The above protocol quite obviously achieves the result described in
+the concluding paragraph above. However, many other, and often
+simpler, protocols would also do that. The other protocols we looked
+at, though, are not secure or robust, or require more operations. To
+date we think that the above is the shortest (in particular in the
+number of IPC operations) protocol that is also secure and robust
+(and if it is not, we think it can be fixed to be secure and robust
+with minimal changes). We have no proof of its correctness. Our
+confidence comes
+from the scrutiny we applied to it. If you find a problem with the
+above protocol, or if you can prove various aspects of it, we would
+like to hear about it.
+
+To understand why the protocol is laid out as it is, and why it is a
+secure and robust protocol, one has to understand what could possibly
+go wrong and why it does not cause any problems for any participant if
+it follows its part of the protocol (independently of what the other
+participants do). In the following paragraphs, various scenarios are
+suggested where things do not go as expected in the above protocol.
+This is probably not a complete list, but it should come close to it.
+If you find any other problematic scenario, again, let us know.
+
+\begin{comment}
+ Although some comments like this appear in the protocol description
+  above, many comments have been saved for the following analysis of
+ potential problems. Read the analysis carefully, as it provides
+ important information about how, and more importantly, why it works.
+\end{comment}
+
+\subsubsection{The server $S$ dies}
+What happens if the server $S$ dies unexpectedly sometime throughout
+the protocol?
+
+\begin{comment}
+ At any time a task dies, the task info caps it held are released.
+ Also, task death notifications are sent to any task that holds task
+ info caps to the now dead task. The task death notifications will
+  be processed asynchronously, so they might be processed immediately,
+  or at any later time, even long after the task died! So one
+ important thing to keep in mind is that the release of task info
+ caps a task held, and other tasks noticing the task death, are
+ always some time apart.
+\end{comment}
+
+Because the client $C$ holds a task info cap for $S$, no imposter can
+get the task ID of $S$. $C$ and $D$ will get errors when trying to
+send messages to $S$.
+
+\begin{comment}
+ You might now wonder what happens if $C$ also dies, or if $C$ is
+ malicious and does not hold the task info cap. You can use this as
+ an exercise, and try to find the answer on your own. The answers
+ are below.
+\end{comment}
+
+Eventually, $C$ (and $D$ if it already got the task info cap for $S$)
+will process the task death notification and clean up their state.
+
+\subsubsection{The client $C$ dies}
+The server $S$ and the destination task $D$ hold a task info cap for
+$C$, so no imposter can get its task ID. $S$ and $D$ will get errors
+when trying to send messages to $C$. Depending on when $C$ dies, the
+capability might be copied successfully or not at all.
+
+Eventually, $S$ and $D$ will process the task death notification and
+release all resources associated with $C$. If the reference was not
+yet copied, this will include the reference container associated with
+$C$, if any. If the reference was already copied, this will only
+include the empty reference container, if any.
+
+\begin{comment}
+ Of course, the participants need to use internal locking to protect
+ the integrity of their internal data structures. The above protocol
+ does not show where locks are required. In the few cases where some
+ actions must be performed atomically, a wording is used that
+ suggests that.
+\end{comment}
+
+\subsubsection{The destination task $D$ dies}
+
+The client $C$ holds a task info cap for $D$ over the whole operation,
+so no imposter can get its task ID. Depending on when $D$ dies, it
+has either not yet accepted the capability, then $C$ will clean up by
+destroying the reference container, or it has, and then $S$ will clean
+up its state when it processes the task death notification for $D$.
+
+\subsubsection{The client $C$ and the destination task $D$ die}
+
+This scenario is the reason why the server acquires its own task info
+cap for $D$ so early, and why it must do that under the constraint
+that $C$ still lives. If $C$ and $D$ die before the server created
+the reference container, then either no request was made, or creating
+the task info cap for $D$ fails because of the constraint. If $C$ and
+$D$ die afterwards, then no imposter can get the task ID of $D$ and
+try to get at the reference in the container, because the server has
+its own task info cap for $D$.
+
+\begin{comment}
+ This problem was identified very late in the development of this
+  protocol. We just did not think of both clients dying at the same
+ time! In an earlier version of the protocol, the server would
+ acquire its task info cap when $D$ accepts its reference. This is
+ too late: If $C$ and $D$ die just before that, an imposter with
+ $D$'s task ID can try to get the reference in the container before
+ the server processes the task death notification for $C$ and
+ destroys it.
+\end{comment}
+
+Eventually, the server will receive and process the task death
+notifications. If it processes the task death notification for $C$
+first, it will destroy the whole container immediately, including the
+reference, if any. If it processes the task death notification for
+$D$ first, it will destroy the reference, and leave behind the empty
+container associated with $C$, until the other task death notification
+is processed. Either way no imposter can get at the capability.
+
+Of course, if the capability was already copied at the time $C$ and
+$D$ die, the server will just do the normal cleanup.
+
+\subsubsection{The client $C$ and the server $S$ die}
+
+This scenario does not cause any problems, because the destination
+task $D$ holds a task info cap for $C$, and it acquires its own task
+info cap for $S$. Although it does this quite late in the protocol,
+it does so under the constraint that $C$, which holds a task info cap
+for $S$ the whole time (until it dies), still lives. It also gets the
+task info cap for $S$ before sending any message to it. An imposter
+with the task ID of $S$, which it was possible to get because $C$
+died early, would not receive any message from $D$, because $D$ uses
+$C$ as its constraint in acquiring the task info cap for $S$.
+
+\subsubsection{The destination task $D$ and the server $S$ die}
+
+As $C$ holds task info caps for $S$ and $D$, there is nothing that can
+go wrong here. Eventually, the task death notifications are
+processed, but the task info caps are not released until the protocol
+is completed or aborted because of errors.
+
+\subsubsection{The client $C$, the destination task $D$ and the server $S$ die}
+
+Before the last one of these dies, you are in one of the scenarios
+that have already been covered. After the last one dies, there is
+nothing to take care of anymore.
+
+\begin{comment}
+ In this case your problem is probably not the capability copy
+ protocol, but the stability of your software! Go fix some bugs.
+\end{comment}
+
+So far we have covered the scenarios where one or more of the
+participating tasks die unexpectedly. They could also die
+purposefully. Other things that
+tasks can try to do purposefully to break the protocol are presented
+in the following paragraphs.
+
+\begin{comment}
+ A task that tries to harm other tasks by not following a protocol
+  and not behaving as other tasks expect is called malicious. Besides
+ security concerns, this is also an issue of robustness, because
+ malicious behaviour can also be triggered by bugs rather than bad
+ intentions.
+
+ It is difficult to protect against malicious behaviour by trusted
+ components, like the server $S$, which is trusted by both $C$ and
+ $D$. If a trusted component is compromised or buggy, ill
+ consequences for software that trusts it must be expected. Thus, no
+  analysis is provided for scenarios involving a malicious or buggy
+ server $S$.
+\end{comment}
+
+\subsubsection{The client $C$ is malicious}
+
+If the client $C$ wants to break the protocol, it has numerous
+possibilities to do so. The first thing it can do is to provide a
+wrong destination task ID when creating the container. But in this
+case, the server will return an error to $D$ when it tries to accept
+it, and this will give $D$ a chance to notice the problem and clean
+up. This would also allow some other task to receive the container,
+but the client can give the capability to any other task it wants to
+anyway, so this is not a problem.
+
+\begin{comment}
+ If a malicious behaviour results in an outcome that can also be
+ achieved following the normal protocol with different parameters,
+  then this is not a problem at all.
+\end{comment}
+
+The client could also try to create a reference container for $D$ and
+then not tell $D$ about it. However, a reference container should not
+consume a lot of resources in the server, and all such resources
+should be attributed to $C$. When $C$ dies eventually, the server
+will clean up any such pending containers when the task death
+notification is processed.
+
+The same argument holds when $C$ leaves out the call to
+\verb/cap_ref_cont_destroy/.
+
+The client $C$ could also provide wrong information to $D$. It could
+supply a wrong server thread ID $T$. It could supply a wrong
+reference container ID $R$. If $D$ does not trust $C$ and expects a
+capability implemented by some specific trusted server, it will verify
+the thread ID numerically and reject it if it does not match. The
+reference container ID will be verified by the server, and it will
+only be accepted if the reference container was created by the client
+task $C$. Thus, the only wrong reference container IDs that the
+client $C$ could use to not provoke an error message from the server
+(which would then lead $D$ to abort the operation) would be a reference
+container that it created itself in the first place. However, $C$
+is already free to send $D$ any reference container it created.
+
+\begin{comment}
+  Again, $C$ can not achieve anything it could not achieve by just
+ following the protocol as well. If $C$ tries to use the same
+ reference container with several RPCs in $D$, one of them would
+ succeed and the others would fail, hurting only $C$.
+
+ If $D$ does trust $C$, then it can not protect against malicious
+ behaviour by $C$.
+\end{comment}
+
+To summarize the result so far: $C$ can provide wrong data in the
+operations it does, but it can not achieve anything this way that it
+could not achieve by just following the protocol. In most cases the
+operation would just fail. If it leaves out some operations, trying
+to provoke resource leaks in the server, it will only hurt itself (as
+the reference container is strictly associated with $C$ until the
+reference is accepted by $D$).
+
+\begin{comment}
+ For optimum performance, the server should be able to keep the
+ information about the capabilities and reference containers a client
+  holds in memory that is allocated on the client's behalf.
+
+ It might also use some type of quota system.
+\end{comment}
+
+Another attack that $C$ can attempt is to deny a service that $S$ and
+$D$ expect of it. Besides not performing one or more of the RPCs,
+this means in particular not holding the task info caps for the time
+spans described in the protocol. Of course, this can only be potentially
+dangerous in combination with a task death. If $C$ does not hold the
+server task info capability, then an imposter of $S$ could trick $D$
+into using the imposter as the server. However, this is only possible
+if $D$ already trusts $C$. Otherwise it would only allow servers that
+it already trusts, and it would always hold task info caps to such
+trusted servers when making the decision that it trusts them.
+However, if $D$ trusts $C$, it can not protect against $C$ being
+malicious.
+
+\begin{comment}
+ If $D$ does not trust $C$, it should only ever compare the task ID
+ of the server thread against trusted servers it has a task info cap
+ for. It must not rely on $C$ doing that for $D$.
+
+ However, if $D$ does trust $C$, it can rely on $C$ holding the
+ server task info cap until it got its own. Thus, the task ID of $C$
+ can be used as the constraint when acquiring the task info cap in
+ the protocol.
+\end{comment}
+
+If $C$ does not hold the task info cap of $D$, and $D$ dies before the
+server acquires its task info cap for $D$, it might get a task info
+cap for an imposter of $D$. But if the client wants to achieve that,
+it could just follow the protocol with the imposter as the destination
+task.
+
+\subsubsection{The destination task $D$ is malicious}
+
+The destination task does not have as many possibilities as $C$ to attack
+the protocol. This is because it is trusted by $C$. So the only
+participant that $D$ can try to attack is the server $S$. But the
+server $S$ does not rely on any action by $D$. $D$ does not hold any
+task info caps for $S$. The only operation it does is an RPC to $S$
+accepting the capability, and if it omits that it will just not get
+the capability (the reference will be cleaned up by $C$ or by the
+server when $C$ dies).
+
+The only thing that $D$ could try is to provide false information in
+the \verb/cap_ref_cont_accept/ RPC. The information in that RPC is
+the task ID of the client $C$ and the reference container ID $R$. The
+server will verify that the client $C$ has previously created a
+reference container with the ID $R$ that is destined for $D$. So $D$
+will only be able to accept references that it is granted access to.
+Thus it can not achieve anything that it could not achieve by following
+the protocol (possibly the protocol with another client). If $D$
+accepts capabilities from other transactions outside of the protocol,
+it can only cause other transactions in its own task to fail.
+
+\begin{comment}
+ If you can do something wrong and harm yourself that way, then this
+  is called ``shooting yourself in the foot''.
+
+  The destination task $D$ is welcome to shoot itself in the foot.
+\end{comment}
+
+\subsubsection{The client $C$ and the destination task $D$ are malicious}
+
+The final question we want to raise is what can happen if the client
+$C$ and the destination task $D$ are malicious. Can $C$ and $D$
+cooperate in attacking $S$ in a way that $C$ or $D$ alone could not?
+
+In the above analysis, there is no place where we assume any specific
+behaviour of $D$ to help $S$ in preventing an attack on $S$. There is
+only one place where we make an assumption for $C$ in the analysis of
+a malicious $D$. If $D$ does not accept a reference container, we
+said that $C$ would clean it up by calling
+\verb/cap_ref_cont_destroy/. So we have to look at what would happen
+if $C$ were not to do that.
+
+Luckily, we covered this case already. It is identical to the case
+where $C$ does not even tell $D$ about the reference container and
+just does nothing. In this case, as said before, the server will
+eventually release the reference container when $C$ dies. Before
+that, it only occupies resources in the server that are associated
+with $C$.
+
+This analysis is sketchy in parts, but it covers a broad range of
+possible attacks. For example, all possible and relevant combinations
+of task deaths and malicious tasks are covered. Although by no means
+complete, it can give us some confidence in the correctness of the
+protocol. It also provides a good set of test cases against which
+you can test your own protocols and improvements to the above
+protocol.
+
+
+\subsection{The trust rule}
+
+The protocol to copy a capability from one client to another task has
+a dramatic consequence on the design of the Hurd interfaces.
+
+Because the receiver of the capability must make blocking calls to the
+server providing the capability, the receiver of the capability
+\emph{must} trust the server providing the capability.
+
+This means also: If the receiver of a capability does not trust the
+server providing the capability, it \emph{must not} accept it.
+
+The consequence is that normally, servers can not accept capabilities
+from clients, unless they are provided by a specific trusted server.
+This can be the \texttt{task} or \texttt{auth} server, for example.
+
+This rule is even true if the receiver does not actually want to use
+the capability for anything. Merely accepting the capability already
+requires trusting the server providing it.
+
+In the Hurd on Mach, ports (which are analogous to capabilities in
+this context) can be passed around freely. There is no security risk
+in accepting a port from any source, because the kernel implements
+them as protected objects. Using a port by sending blocking messages
+to it requires trust, but simply storing the port on the server side
+does not.
+
+This is different in the Hurd on L4: A server must not accept
+capabilities unless it trusts the server providing them. Because
+capabilities are used for many different purposes (remote objects,
+authentication, identification), one has to be very careful in
+designing the interfaces. The Hurd interfaces on Mach use ports in a
+way that is not possible on L4. Such interfaces need to be
+redesigned.
+
+Often, redesigning such an interface also fixes some other security
+problems that would otherwise exist in the Hurd on L4, in particular DoS
+attacks. A good part of this paper is about redesigning the Hurd to
+avoid storing untrusted capabilities on the server side.
+
+\begin{comment}
+ Examples are:
+
+ \begin{itemize}
+ \item The new authentication protocol, which eliminates the need for
+ a rendezvous port and is not only faster, but also does not
+ require the server to block on the client anymore (see section
+ \ref{auth} on page \pageref{auth}).
+
+ \item The signal handling, which does not require the \texttt{proc}
+ server to hold the signal port for every task anymore (see section
+ \ref{signals} on page \pageref{signals}).
+
+ \item The new exec protocol, which eliminates the need to pass all
+ capabilities that need to be transfered to the new executable from
+ the old program to the filesystem server, and then to the
+ \texttt{exec} server (see section \ref{exec} on page
+ \pageref{exec}).
+
+  \item The new way to implement Unix Domain Sockets, which does not
+    require a trusted system server, so that descriptor passing (which
+    is really capability passing) can work (see section
+    \ref{unixdomainsockets} on page \pageref{unixdomainsockets}).
+
+ \item The way parent and child filesystem are linked to each other,
+ in other words: how mounting a filesystem works (see section
+ \ref{xfslookup} on page \pageref{xfslookup}).
+
+ \item The replacement for the \verb/file_reparent()/ RPC (see
+ section \ref{reparenting} on page \pageref{reparenting}).
+ \end{itemize}
+\end{comment}
+
+\section{Synchronous IPC}
+
+The Hurd only needs synchronous IPC. Asynchronous IPC is usually not
+required. One exception is notifications (see below).
+
+There are possibly some places in the Hurd source code where
+asynchronous IPC is assumed. These must be replaced with different
+strategies. One example is the implementation of \texttt{select()} in the GNU
+C library.
+
+\begin{comment}
+ A naive implementation would use one thread per capability to select
+ on. A better one would combine all capabilities implemented by the
+ same server in one array and use one thread per server.
+
+  A more complex scheme might let the server process \texttt{select()} calls
+ asynchronously and report the result back via notifications.
+\end{comment}
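+
+To illustrate the ``one thread per server'' strategy, consider the
+following sketch; \verb/rpc_select/ is a hypothetical per-server RPC
+that blocks until one of the given capabilities becomes ready, and
+the wakeup of the main thread is elided:
+
+\begin{verbatim}
+#include <pthread.h>
+
+typedef unsigned int task_id_t;
+typedef unsigned int cap_id_t;
+
+/* All capabilities implemented by one server.  */
+struct select_group
+{
+  task_id_t server;
+  cap_id_t *caps;
+  int nr_caps;
+  int *ready;        /* Out: one readiness flag per capability.  */
+};
+
+/* Hypothetical RPC: block in SERVER until one of the capabilities
+   becomes ready.  */
+extern int rpc_select (task_id_t server, cap_id_t *caps,
+                       int nr_caps, int *ready);
+
+/* Worker thread, one per server involved in the select() call.  */
+static void *
+select_worker (void *arg)
+{
+  struct select_group *group = arg;
+
+  rpc_select (group->server, group->caps, group->nr_caps,
+              group->ready);
+  /* Here: signal the thread that called select(), which then
+     cancels the workers of the other groups.  */
+  return NULL;
+}
+\end{verbatim}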
+
+In other cases, the Hurd sends a message and receives the reply
+asynchronously. This works fine on Mach, because send-once rights are
+used as reply ports and Mach guarantees to deliver the reply message,
+ignoring the kernel queue limit. In L4, no messages are queued and
+such places need to be rewritten in a different way (for example using
+extra threads).
+
+\begin{comment}
+ What happens if a client does not go into the receive phase after a
+  send, but instead does another send, and another one, many sends
+  in quick succession? A carelessly written server might
+ create worker threads for each request. Instead, the server should
+  probably refuse to accept a request from a client thread that
+ already has a pending request, so the number of worker threads is
+ limited to the number of client threads.
+
+ This also makes interrupting an RPC operation easier (the client
+ thread ID can be used to identify the request to interrupt).
+\end{comment}
+
+
+\section{Notifications}
+
+Notifications to untrusted tasks happen frequently. One case is
+object death notifications, in particular task death notifications.
+Other cases might be \texttt{select()} or notifications of changes to the
+filesystem.
+
+The console uses notifications to broadcast change events for the
+console content, but it also uses shared memory to broadcast the
+actual data, so not all notifications need to be received for
+functional operation. Still, at least one notification is queued by
+Mach, and this is sufficient for the console to wake up whenever
+changes happen, even if the changes can not be processed
+immediately.
+
+From the server's point of view, notifications are simply messages with
+a send and xfer timeout of 0 and without a receive phase.
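+
+In terms of the L4 X.2 convenience interface, sending a notification
+might look like the following sketch (the message register setup is
+elided, and the use of \verb/L4_Reply/ for a send-only IPC with a
+zero send timeout is an assumption about the final environment):
+
+\begin{verbatim}
+#include <l4/ipc.h>
+#include <l4/thread.h>
+
+void
+notify (L4_ThreadId_t client)
+{
+  L4_MsgTag_t tag;
+
+  /* Do not block on string transfers either.  */
+  L4_Set_XferTimeouts (L4_Timeouts (L4_ZeroTime, L4_ZeroTime));
+
+  /* Send-only IPC with a zero send timeout.  */
+  tag = L4_Reply (client);
+  if (L4_IpcFailed (tag))
+    {
+      /* The client was not ready; it is up to the client to notice
+         that it missed a notification.  */
+    }
+}
+\end{verbatim}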
+
+For the client, however, there is only one way to ensure that it will
+receive the notification: It must have the receiving thread in the
+receive phase of an IPC. While this thread is processing the
+notification (even if it is only delegating it), it might be preempted
+and another (or the same) server might try to send a second
+notification.
+
+\begin{comment}
+ It is an open challenge how the client can ensure that it either
+ receives the notification or at least knows that it missed it, while
+  the server remains safe from potential DoS attacks. The usual
+ strategy, to give receivers of notifications a higher scheduling
+ priority than the sender, is not usable in a system with untrusted
+ receivers (like the Hurd). The best strategy determined so far is
+  to have the servers retry sending the notification several times
+  with small delays in between. This can increase the chance that a
+  client is able to receive the notification. However, there is still
+  the question of what a server can do if the client is not ready.
+
+ An alternative might be a global trusted notification server that
+ runs at a higher scheduling priority and records which servers have
+ notifications for which clients, and that can be used by clients to
+ be notified of pending notifications. Then the clients can poll the
+ notifications from the servers.
+\end{comment}
+
+
diff --git a/doc/posix.tex b/doc/posix.tex
new file mode 100644
index 0000000..953f1ea
--- /dev/null
+++ b/doc/posix.tex
@@ -0,0 +1,403 @@
+\chapter{The POSIX personality}
+
+The Hurd offers a POSIX API to the user by default. This is
+implemented in the GNU C library, which uses the services provided by
+the Hurd servers. Several system servers support the C library.
+
+
+\section{Process Management}
+\label{proc}
+
+The \texttt{proc} server implements Unix process semantics in the Hurd
+system. It will also assign a PID to each task that was created with
+the \texttt{task} server, so that the owner of these tasks, and the
+system administrator, can at least send the \verb/SIGKILL/ signal to
+them.
+
+The \texttt{proc} server uses the task manager capability from the
+\texttt{task} server to get hold of the information about all tasks
+and the task control caps.
+
+\begin{comment}
+ The \texttt{proc} server might also be the natural place to
+ implement a first policy server for the \texttt{task} server.
+\end{comment}
+
+
+\subsection{Signals}
+\label{signals}
+
+Each process can register the thread ID of a signal thread with the
+\texttt{proc} server. The \texttt{proc} server will give the signal
+thread ID to any other task that asks for it.
+
+\begin{comment}
+ The thread ID can be guessed, so there is no point in protecting it.
+\end{comment}
+
+The signal thread ID can then be used by a task to contact the task to
+which it wants to send a signal. The task must bootstrap its
+connection with the intended receiver of the signal, according to the
+protocol described in section \ref{ipcbootstrap} on page
+\pageref{ipcbootstrap}. As a result, it will receive the signal
+capability of the receiving task.
+
+When a signal is posted to the signal capability, the sender must
+provide some capability that proves that it is allowed to send the
+signal. For example, the owner of the task control
+cap is usually allowed to send any signal to it. Other capabilities
+might only give permission to send some types of signals.
+
+\begin{comment}
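+A sender's side might look roughly like this sketch; all names are
+placeholders, and the handshake is the bootstrap protocol of section
+\ref{ipcbootstrap}:
+
+\begin{verbatim}
+typedef int error_t;
+typedef unsigned int task_id_t;
+typedef unsigned int cap_id_t;
+typedef unsigned long thread_id_t;
+
+extern error_t proc_get_signal_thread (task_id_t task,
+                                       thread_id_t *thread);
+extern error_t client_bootstrap (thread_id_t server, cap_id_t *cap);
+extern error_t signal_post (cap_id_t sig_cap, cap_id_t auth_cap,
+                            int signo);
+
+/* Send SIGNO to TASK, proving with AUTH_CAP that we may do so.  */
+error_t
+send_signal (task_id_t task, cap_id_t auth_cap, int signo)
+{
+  thread_id_t sig_thread;
+  cap_id_t sig_cap;
+
+  /* The proc server only acts as a name server for the signal
+     thread ID.  */
+  error_t err = proc_get_signal_thread (task, &sig_thread);
+  if (! err)
+    err = client_bootstrap (sig_thread, &sig_cap);
+  if (! err)
+    err = signal_post (sig_cap, auth_cap, signo);
+  return err;
+}
+\end{verbatim}
+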
+ The receiver of the signal decides itself which signals to accept
+ from which other tasks. The default implementation in the C library
+ provides POSIX semantics, plus some extensions.
+\end{comment}
+
+Signal handling is thus completely implemented locally in each task.
+The \texttt{proc} server only serves as a name-server for the thread
+IDs of the signal threads.
+
+\begin{comment}
+ The \texttt{proc} server can not hold the signal capability itself,
+  as it used to do in the implementation on Mach, because it does not trust
+ the tasks implementing the capability. But this is not a problem,
+ as the sender and receiver of a signal can negotiate and bootstrap
+ the connection without any further support by the \texttt{proc}
+ server.
+
+ Also, the \texttt{proc} server can not even hold task info caps to
+ support the sender of a signal in bootstrapping the connection.
+ This means that there is a race between looking up the signal thread
+ ID from the PID in the \texttt{proc} server and acquiring a task
+ info cap for the task ID of the signal receiver in the sender.
+ However, in Unix, there is always a race when sending a signal using
+  \verb/kill/. The \texttt{task} server helps the users a bit here by
+  not reusing task IDs for as long as possible.
+\end{comment}
+
+Some signals are not implemented by sending a message to the task.
+\verb/SIGKILL/, for example, destroys the task without contacting it at
+all. This feature is implemented in the \texttt{proc} server.
+
+The signal capability is also used for other things, like the message
+interface (which allows you to manipulate the environment variables
+and \texttt{auth} capability of a running task, etc.).
+
+
+\subsection{The \texttt{fork()} function}
+
+To be written.
+
+
+\subsection{The \texttt{exec()} function}
+\label{exec}
+
+The \texttt{exec()} operation will be done locally in a task.
+Traditionally, \texttt{exec()} overlays the same task with a new
+process image, because creating a new task and transferring the
+associated state is expensive. In L4, only the threads and virtual
+memory mappings are actually kernel state associated with a task, and
+exactly those have to be destroyed by \texttt{exec()} anyway. There
+is a lot of Hurd specific state associated with a task (capabilities,
+for example), but it is difficult to preserve that. There are
+security concerns, because POSIX programs do not know about Hurd
+features like capabilities, so inheriting all capabilities across
+\texttt{exec()} unconditionally seems dangerous.
+
+\begin{comment}
+  One might think that if a program is not Hurd-aware, then it will
+ not make any use of capabilities except through the normal POSIX
+ API, and thus there are no capabilities except those that the GNU C
+ library uses itself, which \texttt{exec()} can take care of.
+ However, this is only true if code that is not Hurd-aware is never
+ mixed with Hurd specific code, even libraries (unless the library
+ intimately cooperates with the GNU C library). This would be a high
+ barrier to enable Hurd features in otherwise portable programs and
+ libraries.
+
+ It is better to make all POSIX functions safe by default and allow
+ for extensions to let the user specify which capabilities besides
+ those used for file descriptors etc to be inherited by the new
+ executable.
+
+  For \verb/posix_spawn()/, this is straightforward. For
+  \texttt{exec()}, it is not. Either specific capabilities could be
+  marked as ``do not close on \texttt{exec()}'', or variants of the
+ \texttt{exec()} function could be provided which take further
+ arguments.
+\end{comment}
+
+There are also implementation obstacles hindering the reuse of the
+existing task. Only local threads can manipulate the virtual memory
+mappings, and there is a lot of local state that has to be kept
+somewhere between the time the old program becomes defunct and the new
+binary image is installed and used (not to speak of the actual program
+snippet that runs during the transition).
+
+So the decision was made to always create a new task with
+\texttt{exec()}, and copy the desired state from the current task to
+the new task. This is a clean solution, because a new task will
+always start out without any capabilities in servers, etc, and thus
+there is no need for the old task to try to destroy all unneeded
+capabilities and other local state before \texttt{exec()}. Also, in
+case the \texttt{exec()} fails, the old program can continue to run,
+even if the failure occurs at a very late point (there is no ``point
+of no return'' until the new task is actually up and running).
+
+For suid and sgid applications, the actual \texttt{exec()} has to be
+done by the filesystem. However, the filesystem can not be bothered
+to also transfer all the user state into the new task. It can not
+even do that, because it can not accept capabilities implemented by
+untrusted servers from the user. Also, the filesystem does not want
+to rely on the new task to be cooperative, because it does not
+necessarily trust the code, if it is owned by an untrusted user.
+
+\begin{enumerate}
+\item The user creates a new task and a container with a single
+ physical page, and makes the \texttt{exec()} call to the file
+ capability, providing the task control capability. Before that, it
+ creates a task info capability from it for its own use.
+\item The filesystem checks permission and then revokes all other
+ users on the task control capability. This will revoke the users
+ access to the task, and will fail if the user did not provide a
+ pristine task object. (It is assumed that the filesystem should not
+ create the task itself so the user can not use suid/sgid
+ applications to escape from their quota restriction).
+\item Then it revokes access to the provided physical page and writes
+  trusted startup code to it.
+\item The filesystem will also prepare all capability transactions and
+ write the required information (together with other useful
+ information) in a stack on the physical page.
+\item Then it creates a thread in the task, and starts it. On a
+  page fault, it will provide the physical page.
+\item The startup code on the physical page completes the capability
+ transfer. It will also install a small pager that can install file
+ mappings for this binary image. Then it jumps to the entry point.
+\item The filesystem in the meanwhile has done all it can do to help
+ the task startup. It will provide the content of the binary or
+ script via paging or file reads, but that happens asynchronously,
+ and as for any other task. So the filesystem returns to the client.
+\item The client can then send its untrusted information to the new
+ task. The new task got the client's thread ID from the filesystem
+ (possibly provided by the client), and thus knows to which thread it
+ should listen. The new task will not trust this information
+  ultimately (i.e., the new task will use the authentication, root
+ directory and other capabilities it got from the filesystem), but it
+ will accept all capabilities and make proper use of them.
+\item Then the new task will send a message to the \texttt{proc}
+  server to take over the old PID and other process state. How this
+  can best be done is still to be determined (likely the old task
+  will provide a process control capability to the new task). At
+  that moment, the old task is destroyed by the \texttt{proc} server.
+\end{enumerate}
+
+This is a coarse and incomplete description, but it shows the general
+idea. The details will depend a lot on the actual implementation.
+
+
+\section{Unix Domain Sockets}
+\label{unixdomainsockets}
+
+In the Hurd on Mach, there was a global \texttt{pflocal} server that
+provided Unix domain sockets and pipes to all users. This will not
+work very well in the Hurd on L4, because for descriptor passing
+(read: capability passing), the Unix domain socket server needs to
+accept capabilities in transit. User capabilities are often
+implemented by untrusted servers, though, and thus a global
+\texttt{pflocal} server running as root can not accept them.
+
+However, Unix domain sockets and pipes can not be implemented locally
+in the task. An external task is needed to hold buffered data and
+capabilities in transit. In theory, a new task could be used for
+every pipe or Unix domain socketpair. However, in practice, one
+server for each user would suffice and perform better.
+
+This works, because access to Unix Domain Sockets is controlled via
+the filesystem, and access to pipes is controlled via file
+descriptors, usually by inheritance. For example, if a fifo is
+installed as a passive translator in the filesystem, the first user
+accessing it will create a pipe in his pflocal server. From then on,
+an active translator must be installed in the node that redirects any
+other users to the right pflocal server implementing this fifo. This
+is asymmetrical in that the first user to access a fifo will implement
+it, and thus pay the costs for it. But it does not seem to cause any
+particular problems in implementing the POSIX semantics.
+
+The GNU C library can contact \verb|~/servers/socket/pflocal| to
+implement \texttt{socketpair()}, or start a \texttt{pflocal} server
+for this task's exclusive use if that node does not exist.
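+
+A sketch of that fallback logic follows; all names except the node
+path are placeholders:
+
+\begin{verbatim}
+typedef int error_t;
+typedef unsigned int cap_id_t;
+
+/* Placeholders: look up a filesystem node (fs_lookup is assumed to
+   expand the leading "~" to the user's home directory), and start
+   a private pflocal server.  */
+extern error_t fs_lookup (const char *path, cap_id_t *cap);
+extern error_t start_private_pflocal (cap_id_t *cap);
+
+static error_t
+get_pflocal (cap_id_t *pflocal)
+{
+  error_t err = fs_lookup ("~/servers/socket/pflocal", pflocal);
+  if (err)
+    /* No shared server for this user; run one for this task.  */
+    err = start_private_pflocal (pflocal);
+  return err;
+}
+\end{verbatim}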
+
+All of this is an optimization: it would also work to have one
+\texttt{pflocal} process for each socketpair. However, performance
+should be better with a shared \texttt{pflocal} server, one per user.
+
+
+\section{Pipes}
+
+Pipes are implemented using \texttt{socketpair()}, that is, as an
+unnamed pair of Unix domain sockets. The \texttt{pflocal} server will
+support this by implementing pipe semantics on the socketpair if
+requested.
+
+\begin{comment}
+  We considered using shared memory for the pipe implementation.
+ But we are not aware of a lock-free protocol using shared memory
+ with multiple readers and multiple writers. It might be possible,
+ but it is not obvious if that would be faster: Pipes are normally
+ used with \texttt{read()} and \texttt{write()}, so the data has to
+ be copied from and to the supplied buffer. This can be done
+ efficiently in L4 even across address spaces using string items. In
+ the implementation using sockets, the \texttt{pflocal} server
+ handles concurrent read and write accesses with mutual exclusion.
+\end{comment}
+
+
+\section{Filesystems}
+
+\subsection{Directory lookup across filesystems}
+\label{xfslookup}
+
+The Hurd has the ability to let users mount filesystems and other
+servers providing a filesystem-like interface. Such filesystem
+servers are called translators. In the Hurd on GNU Mach, the parent
+filesystem would automatically start up such translators from passive
+translator settings in the inode. It would then block until the child
+filesystem sends a message to its bootstrap port (provided by the
+parent fs) with its root directory port. This root directory port can
+then be given to any client looking up the translated node.
+
+There are several things wrong with this scheme, which become
+apparent in the Hurd on L4. The parent filesystem must be careful to
+not block on creating the child filesystem task. It must also be
+careful to not block on receiving any acknowledgement or startup
+message from it. Furthermore, it can not accept the root directory
+capability from the child filesystem and forward it to clients, as
+the child filesystem is potentially not trusted.
+
+The latter problem can be solved in the following way: the parent
+filesystem knows about the server thread in the child filesystem. It
+also
+implements an authentication capability that represents the ability to
+access the child filesystem. This capability is also given to the
+child filesystem at startup (or when it attaches itself to the parent
+filesystem). On a client's \verb/dir_lookup/, the parent filesystem
+can return the server thread and the authentication capability to the
+client.
+The client can use that to initiate a connection with the child
+filesystem (by first building up a connection, then sending the
+authentication capability from the parent filesystem, and receiving a
+root directory capability in exchange).
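+
+A client-side sketch of this crossing might read as follows; the
+names are placeholders, and the retry logic for the race discussed
+next is omitted:
+
+\begin{verbatim}
+typedef int error_t;
+typedef unsigned int cap_id_t;
+typedef unsigned long thread_id_t;
+
+extern error_t dir_lookup (cap_id_t dir, const char *name,
+                           thread_id_t *child_server,
+                           cap_id_t *auth_cap);
+extern error_t client_bootstrap (thread_id_t server, cap_id_t *conn);
+extern error_t fs_exchange_auth (cap_id_t conn, cap_id_t auth_cap,
+                                 cap_id_t *root_dir);
+
+/* Look up NAME under DIR, crossing into a child filesystem.  */
+error_t
+lookup_across_fs (cap_id_t dir, const char *name, cap_id_t *root_dir)
+{
+  thread_id_t child;
+  cap_id_t auth_cap, conn;
+
+  /* The parent returns the child's server thread and an
+     authentication capability for the child filesystem.  */
+  error_t err = dir_lookup (dir, name, &child, &auth_cap);
+  if (! err)
+    err = client_bootstrap (child, &conn);
+  if (! err)
+    /* Exchange the parent's authentication capability for the
+       child's root directory capability.  */
+    err = fs_exchange_auth (conn, auth_cap, root_dir);
+  return err;
+}
+\end{verbatim}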
+
+\begin{comment}
+ There is a race here. If the child filesystem dies and the parent
+ filesystem processes the task death notification and releases the
+ task info cap for the child before the user acquires its own task
+ info cap for the child, then an imposter might be able to pretend to
+ be the child filesystem for the client.
+
+ This race can only be avoided by a more complex protocol:
+
+  Variant 1: The user has to acquire the task info cap for the child
+  fs, and then perform the lookup again.  If the thread ID returned
+  by the second lookup belongs to the task for which it acquired the
+  task info cap, it can go on.  If not, it has to retry.  This is not
+  so good because a directory lookup is usually an expensive
+  operation.  However, it has the advantage of only slowing down the
+  rare case.
+
+  Variant 2: The client creates an empty reference container in the
+  task server, which can then be used by the server to fill in a
+  reference to the child's task ID.  However, the client has to
+  create and destroy such a container for every filesystem where it
+  expects it could be redirected to another one (that means: for all
+  filesystems for which it does not use \verb/O_NOTRANS/).  This
+  imposes quite an overhead on the common case.
+
+\begin{verbatim}
+<marcus> I have another idea
+<marcus> the client does not give a container
+<marcus> server sees child fs, no container -> returns O_NOTRANS node
+<marcus> then client sees error, uses O_NOTRANS node, "" and container
+<marcus> problem solved
+<marcus> this seems to be the optimum
+<neal> hmm.
+<neal> So lazily supply a container.
+<marcus> yeah
+<neal> Hoping you won't need one.
+<marcus> and the server helps you by doing as much as it can usefully
+<neal> And that is the normal case.
+<neal> Yeah, that seems reasonable.
+<marcus> the trick is that the server won't fail completely
+<marcus> it will give you at least the underlying node
+\end{verbatim}
+\end{comment}
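+
+Rendered as code, the lazily supplied container protocol from the
+discussion above might look as follows on the client side (all names
+are hypothetical):
+
+\begin{verbatim}
+/* First try without a container.  Only if the lookup stops at a
+   translated node, create a container and retry from the
+   untranslated node, so the common case pays no extra cost.  */
+err = dir_lookup (dir, name, flags, &node);
+if (err == ERR_CHILD_FS)  /* hypothetical: stopped at child fs */
+  {
+    container = task_create_ref_container ();
+    err = dir_lookup_container (node, "", flags, container, &node);
+    task_release_ref_container (container);
+  }
+\end{verbatim}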
+
+The actual creation of the child filesystem can be performed much like
+a suid exec, just without any client to follow up with further
+capabilities and startup info. The only problem that remains is how
+the parent filesystem can know which thread in the child filesystem
+implements the initial handshake protocol for the clients to use. The
+only safe way here seems to be that the parent filesystem requires the
+child to use the main thread for that, or that the parent filesystem
+creates a second thread in the child at startup (passing its thread ID
+in the startup data), requiring that this second thread is used. In
+either case the parent filesystem will know the thread ID in advance
+because it created the thread in the first place. This looks a bit
+ugly, and violates good taste, so we might try to look for alternative
+solutions.
+
+
+\subsection{Reparenting}
+\label{reparenting}
+
+The Hurd on Mach contains a curious RPC, \verb/file_reparent/, which
+allows you to create a new capability for the same node, with the
+difference that the new node will have a supplied capability as its
+parent node. A directory lookup of \texttt{..} on this new capability
+would return the provided parent capability.
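+
+Illustratively (the signature is an assumption modeled on the Hurd on
+Mach interface):
+
+\begin{verbatim}
+/* Return in NEW_FILE a new capability for the same node as FILE,
+   but with PARENT as its parent node.  A subsequent ".." lookup
+   on NEW_FILE returns PARENT.  */
+error_t file_reparent (file_t file, file_t parent,
+                       file_t *new_file);
+\end{verbatim}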
+
+This RPC is used by the \texttt{chroot()} function, which sets the
+parent node to the null capability to prevent escape from a
+\texttt{chroot()} environment.  It is also used by the
+\texttt{firmlink} translator, which is a cross-over between a
+symbolic link and a hard link: it works like a hard link, but can be
+used across filesystems.
+
+A firmlink is a dangerous thing.  Because the filesystem gives no
+indication whether the parent node it returns is provided by itself
+or by some other, possibly untrusted, filesystem, the user might
+follow the parent node to untrusted filesystems without being aware
+of it.
+
+In the Hurd port to L4, the filesystem can not accept untrusted
+parent capabilities on behalf of the user anymore.  The
+\texttt{chroot()} function is not difficult to implement anyway, as
+no real capability is required: the server can simply be instructed
+to create a node with no parent node, and it can do that without
+problems.  Nevertheless, we also want a secure version of the
+\texttt{firmlink} translator.  This is possible if the same strategy
+is used as in cross-filesystem lookups.  The client registers a
+server thread as the handler for the parent node, and the filesystem
+returns a capability that can be used for authentication purposes.
+Now, the client still needs to connect this to the new parent node.
+Normally, the filesystem providing the new parent node will not trust
+the other filesystem either, and thus can not accept the capability
+that should be used for authentication purposes.  So instead of
+creating a direct link from the one filesystem to the other, the
+\texttt{firmlink} translator must act as a middle man, redirecting
+all accesses to the parent node first to itself, and then to the
+filesystem providing the parent node.  For this, it must request from
+that filesystem a capability that can be used for authentication
+purposes when bootstrapping a connection, and that allows such a
+bootstrapping client to access the parent node directly.
+
+This also fixes the security issues, because now any move away from
+the filesystem providing the reparented node will explicitly go first
+to the \texttt{firmlink} translator, and then to the filesystem
+providing the parent node.  The user can thus make an informed
+decision about whether to trust the \texttt{firmlink} translator and
+the filesystem providing the parent node.
+
+\begin{comment}
+  This is a good example of how the redesign of the IPC system forces
+  us to fix a security issue and provides deeper insight into the
+  trust issues and how to solve them.
+\end{comment}
+
+
diff --git a/doc/threads-tasks.tex b/doc/threads-tasks.tex
new file mode 100644
index 0000000..07e691f
--- /dev/null
+++ b/doc/threads-tasks.tex
@@ -0,0 +1,235 @@
+\chapter{Threads and Tasks}
+
+The \texttt{task} server will provide the ability to create tasks and
+threads, and to destroy them.
+
+\begin{comment}
+ In L4, only threads in the privileged address space (the rootserver)
+ are allowed to manipulate threads and address spaces (using the
+ \textsc{ThreadControl} and \textsc{SpaceControl} system calls). The
+ \texttt{task} server will use the system call wrappers provided by
+ the rootserver, see section \ref{rootserver} on page
+ \pageref{rootserver}.
+\end{comment}
+
+The \texttt{task} server provides three different capability types.
+
+\subsubsection{Task control capabilities}
+If a new task is created, it is always associated with a task control
+capability.  The task control capability can be used to create and
+destroy threads in the task, and to destroy the task itself.  The
+task control capability thus gives its owner control over the task.
+Task control capabilities have the side effect that the task ID of
+the task is not reused as long as the task control capability is not
+released.  Thus, holding a task control capability affects the global
+namespace of task IDs.  If a task is destroyed, task death
+notifications are sent to the holders of task control capabilities
+for that task.
+
+\begin{comment}
+  A task is also implicitly destroyed when the last task control
+  capability reference is released.
+\end{comment}
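+
+A sketch of the operations a task control capability might support
+(the RPC names are illustrative; the actual interface is not yet
+specified in this document):
+
+\begin{verbatim}
+/* Operations on a task control capability TASK.  */
+error_t task_thread_create (cap_t task, l4_thread_id_t *thread);
+error_t task_thread_destroy (cap_t task, l4_thread_id_t thread);
+error_t task_destroy (cap_t task);
+\end{verbatim}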
+
+\subsubsection{Task info capabilities}
+\label{taskinfocap}
+Any task can create task info capabilities for other tasks.  Such
+task info capabilities are used mainly in the IPC system (see section
+\ref{ipc} on page \pageref{ipc}).  Task info capabilities have the
+side effect that the task ID of the task is not reused as long as the
+task info capability is not released.  Thus, holding a task info
+capability affects the global namespace of task IDs.  If a task is
+destroyed, task death notifications are sent to the holders of task
+info capabilities for that task.
+
+\begin{comment}
+ Because of that, holding task info capabilities must be restricted
+ somehow. Several strategies can be taken:
+
+ \begin{itemize}
+  \item Task death notifications can be monitored.  If there is no
+    acknowledgement within a certain time period, the \texttt{task}
+    server could be allowed to reuse the task ID anyway.  This is not
+    a good strategy because it can considerably weaken the security of
+    the system (capabilities might be leaked to tasks which reuse such
+    a task ID reclaimed by force).
+  \item The \texttt{proc} server can show dead task IDs which have
+    not been released yet, in analogy to zombie processes in Unix.
+    It can also make available the list of tasks which prevent
+    reusing the task ID, to allow users or the system administrator
+    to clean up manually.
+  \item Quotas can be used to punish users who do not acknowledge
+    task death in a timely manner.  For example, if the number of
+    tasks the user is allowed to create is restricted, the task info
+    caps that the user holds for dead tasks could be counted toward
+    that limit.
+  \item Any task could be restricted to as many task ID references as
+    there are live tasks in the system, plus some slack.  That would
+    prevent the task from creating new task info caps if it does not
+    release old ones for dead tasks.  The slack is provided so as not
+    to unnecessarily slow down a task that processes task death
+    notifications asynchronously while making connections with new
+    tasks.
+ \end{itemize}
+
+  In particular, the last two approaches should prove effective in
+  providing an incentive for tasks to release task info caps they do
+  not need anymore.
+\end{comment}
+
+\subsubsection{Task manager capability}
+A task is a relatively simple object, compared to a full-blown POSIX
+process, for example.  As the \texttt{task} server is enforced system
+code, the Hurd does not impose POSIX process semantics in the
+\texttt{task} server.  Instead, POSIX process semantics are
+implemented in a different server, the \texttt{proc} server (see also
+section \ref{proc} on page \pageref{proc}).  To allow the
+\texttt{proc} server to do its work, it needs to be able to get the
+task control capability for any task, and to gather other statistics
+about tasks.  Furthermore, there must be the possibility to install
+quota mechanisms and other monitoring systems.  For this, the
+\texttt{task} server provides a task manager capability that allows
+its holder to control the behaviour of the \texttt{task} server and
+to get access to the information and objects it provides.
+
+\begin{comment}
+ For example, the task manager capability could be used to install a
+ policy capability that is used by the \texttt{task} server to make
+ upcalls to a policy server whenever a new task or thread is created.
+ The policy server could then indicate if the creation of the task or
+ thread is allowed by that user. For this to work, the \texttt{task}
+ server itself does not need to know about the concept of a user, or
+ the policies that the policy server implements.
+
+ Now that I am writing this, I realize that without any further
+ support by the \texttt{task} server, the policy server would be
+ restricted to the task and thread ID of the caller (or rather the
+ task control capability used) to make its decision. A more
+ capability oriented approach would then not be possible. This
+ requires more thought.
+
+ The whole task manager interface is not written yet.
+\end{comment}
+
+When creating a new task, the \texttt{task} server allocates a new
+task ID for it. The task ID will be used as the version field of the
+thread ID of all threads created in the task. This allows the
+recipient of a message to verify the sender's task ID efficiently and
+easily.
+
+\begin{comment}
+  The version field is 14 bits wide on 32-bit architectures, and 32
+  bits wide on 64-bit architectures.  Because the lower six bits must
+  not be all zero (to make global thread IDs different from local
+  thread IDs), one in $2^6$ version values is unusable, leaving
+  $2^{14} - 2^8$ and $2^{32} - 2^{26}$ available task IDs,
+  respectively.
+
+ If several systems are running in parallel on the same host, they
+ might share thread IDs by encoding the system ID in the upper bits
+ of the thread number.
+\end{comment}
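+
+As a minimal sketch (the header and function names follow the
+\texttt{libl4} convenience interface and are assumptions), the check
+reduces to reading the version field of the sender's global thread
+ID:
+
+\begin{verbatim}
+#include <l4.h>
+
+/* Return the task ID of the task that THREAD belongs to.  The
+   task server guarantees that the version field of a global
+   thread ID equals the task ID, so no RPC is required.  */
+static inline l4_word_t
+task_id_from_thread (l4_thread_id_t thread)
+{
+  return l4_version (thread);
+}
+\end{verbatim}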
+
+Task IDs will be reused only if there are no task control or info
+capabilities for that task ID held by any task in the system. To
+support bootstrapping an IPC connection (see section
+\ref{ipcbootstrap} on page \pageref{ipcbootstrap}), the \texttt{task}
+server will delay reusing a task ID as long as possible.
+
+\begin{comment}
+  This is similar to how PIDs are generated in Unix.  Although an
+  attempt is made to keep PIDs small for ease of use, PIDs are not
+  reused immediately.  Instead, the PID is incremented up to a
+  certain maximum number, and only then are smaller PID values reused
+  again.
+
+ As task IDs are not a user interface, there is no need to keep them
+ small. The whole available range can be used to delay reusing a
+ task ID as long as possible.
+\end{comment}
+
+When creating a new task, the \texttt{task} server also has to create
+the initial thread.  This thread will be inactive.  Once the user
+requests the creation and activation of the initial thread, the
+\texttt{task} server activates this pre-created thread.  When the
+user requests destroying the last thread in a task, the \texttt{task}
+server makes that thread inactive again.
+
+\begin{comment}
+  In L4, an address space can only be implicitly created (or
+  destroyed) together with the first (or last) thread in that address
+  space.
+\end{comment}
+
+Some operations, like starting and stopping threads in a task, can
+not be supported by the \texttt{task} server, but have to be
+implemented locally in each task because of the minimality of L4.  If
+external control over the threads in a task is required at this
+level, the debugger interface might be used (see section \ref{debug}
+on page \pageref{debug}).
+
+
+\section{Accounting}
+
+We want to allow the users of the system to use the \texttt{task}
+server directly, and ignore other task management facilities like the
+\texttt{proc} server. However, the system administrator still needs
+to be able to identify the user who created such anonymous tasks.
+
+For this, a simple accounting mechanism is provided by the
+\texttt{task} server.  The holder of the task manager capability can
+set an identifier for a task; this identifier is inherited from the
+parent task at task creation time.  The accounting ID can not be
+changed without the task manager capability.
+
+The \texttt{proc} server sets the accounting ID to the process ID
+(PID) of the task whenever a task registers itself with the
+\texttt{proc} server.  This means that all tasks which do not
+register themselves with the \texttt{proc} server will be grouped
+together with the first parent task that did.  This makes it easy to
+kill all unregistered tasks together with their registered parent.
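+
+As a hypothetical sketch of the registration path (the RPC name is an
+assumption):
+
+\begin{verbatim}
+/* On registration, tag TASK with its newly allocated PID.  This
+   requires the task manager capability; children of TASK inherit
+   the accounting ID at task creation time.  */
+error_t
+proc_register_task (cap_t task_manager, cap_t task, pid_t pid)
+{
+  return task_set_accounting_id (task_manager, task, pid);
+}
+\end{verbatim}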
+
+The \texttt{task} server does not interpret or use the accounting ID
+in any way.
+
+
+\section{Proxy Task Server}
+\label{proxytaskserver}
+
+The \texttt{task} server can be safely proxied, and the users of such
+a proxy task server can use it like the real \texttt{task} server,
+even though capabilities work a bit differently for the \texttt{task}
+server than for other servers.
+
+The problem exists because the proxy task server would hold the real
+task info capabilities backing the task info capabilities that it
+provides to the proxied tasks.  So if the proxy task server dies, all
+such task info capabilities would be released, and the tasks using
+the proxy task server would become insecure and open to attacks by
+imposters.
+
+However, this is not really a problem, because the proxy task server
+will also provide proxy objects for all task control capabilities.
+So it will be the only task which holds task control capabilities for
+the tasks that use it.  When the proxy task server dies, all tasks
+that were created through it will be destroyed when these task
+control capabilities are released.  The proxy task server is a vital
+system component for the tasks that use it, just as the real
+\texttt{task} server is a vital system component for the whole
+system.
+
+
+\section{Scheduling}
+
+The \texttt{task} server is the natural place to implement a simple,
+initial scheduler for the Hurd.  A first version can at least collect
+some information about the CPU time of a task and its threads.
+Later, a proper scheduler with SMP support will have to be written.
+
+The scheduler should run at a higher priority than normal threads.
+
+\begin{comment}
+ This might require that the whole task server must run at a higher
+ priority, which makes sense anyway.
+
+ Not much thought has been given to the scheduler so far. This is
+ work that still needs to be done.
+\end{comment}
+
+There is no way to get at the ``system time'' in L4; it is assumed
+that no time is spent in the kernel (which is mostly true).  So the
+system time will always be reported as $0.00$ or $0.01$.
+
+
diff --git a/doc/vmm.tex b/doc/vmm.tex
new file mode 100644
index 0000000..a41c31e
--- /dev/null
+++ b/doc/vmm.tex
@@ -0,0 +1,26 @@
+\chapter{Virtual Memory Management}
+
+Traditionally, monolithic kernels, and even kernels like Mach,
+provide a virtual memory management system in the kernel.  All paging
+decisions are made by the kernel itself.  This requires good
+heuristics, yet smart paging decisions are often not possible because
+the kernel lacks the information about how the data is used.
+
+In the Hurd, paging will be done locally in each task. A physical
+memory server provides a number of guaranteed physical pages to tasks.
+It will also provide a number of excess pages (over-commit). The task
+might have to return any number of excess pages on short notice. If
+the task does not comply, all mappings are revoked (essentially
+killing the task).
+
+A problem arises when data has to be exchanged between a client and a
+server, and the server wants to have control over the content of the
+pages (for example, to pass it on to other servers, like device
+drivers).  The client can not map the pages directly into the
+server's address space, as the client is not trusted.  Container
+objects created in the physical memory server and mapped into the
+client's and/or the server's address space will provide the necessary
+security features to allow this.  This can be used for DMA and
+zero-copy data exchange between device drivers and (untrusted) user
+tasks.
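+
+A sketch of such a container-mediated exchange, with all interfaces
+hypothetical:
+
+\begin{verbatim}
+#include <string.h>
+
+/* Client-side send path: allocate a container from the physical
+   memory server, fill it locally, and hand the handle to the
+   server.  The server or a device driver can then use the pages
+   for DMA without ever mapping untrusted client memory.  */
+static error_t
+send_via_container (cap_t server, const void *data, size_t size)
+{
+  container_t c = physmem_container_create (size);
+  void *buf = physmem_container_map (c);
+  memcpy (buf, data, size);
+  return server_write (server, c, size);
+}
+\end{verbatim}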
+
+