patch-2.3.51 linux/ipc/shm.c

diff -u --recursive --new-file v2.3.50/linux/ipc/shm.c linux/ipc/shm.c
@@ -12,6 +12,16 @@
  * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able,
  *                         Christoph Rohland <hans-christoph.rohland@sap.com>
  * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
+ * make it a file system,  Christoph Rohland <hans-christoph.rohland@sap.com>
+ *
+ * The filesystem has the following restrictions/bugs:
+ * 1) It can only handle one directory.
+ * 2) Because the directory is represented by the SYSV shm array,
+ *    it can be mounted only once.
+ * 3) As a consequence, SYSV shm does not work properly in a
+ *    chrooted environment.
+ * 4) Read and write are not implemented (should they be?)
+ * 5) No special files (device nodes) are supported.
  */
 
 #include <linux/config.h>
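
The new filesystem is mounted exactly once (restriction 2 above); a minimal
user-space sketch of mounting it at the default shm_path, with the options
handled by shm_parse_options() later in this patch (the option values here
are illustrative only):

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* nr_blocks/nr_inodes/mode map onto shm_ctlall,
		 * shm_ctlmni and shm_mode in the superblock code. */
		if (mount("shm", "/var/shm", "shm", 0,
			  "nr_blocks=16384,nr_inodes=4096,mode=1777") != 0) {
			perror("mount");	/* e.g. fs already mounted */
			return 1;
		}
		return 0;
	}
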
@@ -20,6 +30,9 @@
 #include <linux/swap.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
+#include <linux/locks.h>
+#include <linux/file.h>
+#include <linux/mman.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
 #include <linux/proc_fs.h>
@@ -30,23 +43,61 @@
 
 #include "util.h"
 
+static struct super_block *shm_read_super(struct super_block *,void *, int);
+static void	      shm_put_super  (struct super_block *);
+static int	      shm_remount_fs (struct super_block *, int *, char *);
+static void	      shm_read_inode (struct inode *);
+static void	      shm_write_inode(struct inode *);
+static int	      shm_statfs (struct super_block *, struct statfs *);
+static int	      shm_create   (struct inode *,struct dentry *,int);
+static struct dentry *shm_lookup   (struct inode *,struct dentry *);
+static int	      shm_unlink   (struct inode *,struct dentry *);
+static int	      shm_setattr  (struct dentry *dent, struct iattr *attr);
+static void	      shm_delete   (struct inode *);
+static int	      shm_mmap	   (struct file *, struct vm_area_struct *);
+static int	      shm_readdir  (struct file *, void *, filldir_t);
+
+char shm_path[256] = "/var/shm";
+
+#define SHM_NAME_LEN NAME_MAX
+#define SHM_FMT ".IPC_%08x"
+#define SHM_FMT_LEN 13
+
 struct shmid_kernel /* private to the kernel */
 {	
 	struct kern_ipc_perm	shm_perm;
 	size_t			shm_segsz;
-	time_t			shm_atime;
-	time_t			shm_dtime;
-	time_t			shm_ctime;
-	pid_t			shm_cpid;
-	pid_t			shm_lpid;
 	unsigned long		shm_nattch;
 	unsigned long		shm_npages; /* size of segment (pages) */
-	pte_t			**shm_dir;  /* ptr to array of ptrs to frames -> SHMMAX */ 
-	struct vm_area_struct	*attaches;  /* descriptors for attaches */
-	int                     id; /* backreference to id for shm_close */
-	struct semaphore sem;
+	pte_t			**shm_dir;  /* ptr to arr of ptrs to frames */ 
+	int			id;
+	union permap {
+		struct shmem {
+			time_t			atime;
+			time_t			dtime;
+			time_t			ctime;
+			pid_t			cpid;
+			pid_t			lpid;
+			int			nlen;
+			char			nm[0];
+		} shmem;
+		struct zero {
+			struct semaphore	sema;
+			struct list_head	list;
+		} zero;
+	} permap;
 };
 
+#define shm_atim	permap.shmem.atime
+#define shm_dtim	permap.shmem.dtime
+#define shm_ctim	permap.shmem.ctime
+#define shm_cprid	permap.shmem.cpid
+#define shm_lprid	permap.shmem.lpid
+#define shm_namelen	permap.shmem.nlen
+#define shm_name	permap.shmem.nm
+#define zsem		permap.zero.sema
+#define zero_list	permap.zero.list
+
 static struct ipc_ids shm_ids;
 
 #define shm_lock(id)	((struct shmid_kernel*)ipc_lock(&shm_ids,id))
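
SHM_FMT_LEN is 13 because ".IPC_" contributes five characters and the %08x
id contributes eight; a small stand-alone sketch of the name an IPC_PRIVATE
segment gets in the mounted directory:

	#include <stdio.h>

	int main(void)
	{
		char name[32];
		/* same format string as SHM_FMT above */
		int len = sprintf(name, ".IPC_%08x", 0x2a);

		printf("%s (%d chars)\n", name, len);	/* .IPC_0000002a (13 chars) */
		return 0;
	}
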
@@ -60,9 +111,8 @@
 #define shm_buildid(id, seq) \
 	ipc_buildid(&shm_ids, id, seq)
 
-static int newseg (key_t key, int shmflg, size_t size);
-static int shm_map (struct vm_area_struct *shmd);
-static void killseg (int shmid);
+static int newseg (key_t key, const char *name, int namelen, int shmflg, size_t size);
+static void killseg_core(struct shmid_kernel *shp, int doacc);
 static void shm_open (struct vm_area_struct *shmd);
 static void shm_close (struct vm_area_struct *shmd);
 static struct page * shm_nopage(struct vm_area_struct *, unsigned long, int);
@@ -75,12 +125,57 @@
 static void zmap_unuse(swp_entry_t entry, struct page *page);
 static void shmzero_open(struct vm_area_struct *shmd);
 static void shmzero_close(struct vm_area_struct *shmd);
+static struct page *shmzero_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share);
 static int zero_id;
 static struct shmid_kernel zshmid_kernel;
+static struct dentry *zdent;
+
+#define SHM_FS_MAGIC 0x02011994
+
+static struct super_block * shm_sb;
+
+static DECLARE_FSTYPE(shm_fs_type, "shm", shm_read_super, 0);
+
+static struct super_operations shm_sops = {
+	read_inode:	shm_read_inode,
+	write_inode:	shm_write_inode,
+	delete_inode:	shm_delete,
+	put_super:	shm_put_super,
+	statfs:		shm_statfs,
+	remount_fs:	shm_remount_fs,
+};
+
+static struct file_operations shm_root_operations = {
+	readdir:	shm_readdir,
+};
+
+static struct inode_operations shm_root_inode_operations = {
+	create:		shm_create,
+	lookup:		shm_lookup,
+	unlink:		shm_unlink,
+};
+
+static struct file_operations shm_file_operations = {
+	mmap:	shm_mmap,
+};
+
+static struct inode_operations shm_inode_operations = {
+	setattr:	shm_setattr,
+};
+
+static struct vm_operations_struct shm_vm_ops = {
+	open:	shm_open,	/* callback for a new vm-area open */
+	close:	shm_close,	/* callback for when the vm-area is released */
+	nopage:	shm_nopage,
+	swapout:shm_swapout,
+};
 
 size_t shm_ctlmax = SHMMAX;
-int shm_ctlall = SHMALL;
-int shm_ctlmni = SHMMNI;
+
+/* These parameters should be part of the superblock */
+static int shm_ctlall;
+static int shm_ctlmni;
+static int shm_mode;
 
 static int shm_tot = 0; /* total number of shared memory pages */
 static int shm_rss = 0; /* number of shared memory pages that are in memory */
@@ -90,7 +185,7 @@
 	pagecache_lock
 	shm_lock()/shm_lockall()
 	kernel lock
-	shp->sem
+	inode->i_sem
 	sem_ids.sem
 	mmap_sem
 
@@ -104,18 +199,318 @@
 /* some statistics */
 static ulong swap_attempts = 0;
 static ulong swap_successes = 0;
+static ulong used_segs = 0;
 
 void __init shm_init (void)
 {
-	ipc_init_ids(&shm_ids, shm_ctlmni);
+	ipc_init_ids(&shm_ids, 1);
+
+	register_filesystem (&shm_fs_type);
 #ifdef CONFIG_PROC_FS
 	create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL);
 #endif
-	zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, shm_ctlmni);
+	zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, 1);
 	shm_unlock(zero_id);
+	INIT_LIST_HEAD(&zshmid_kernel.zero_list);
+	zdent = d_alloc_root(get_empty_inode());
 	return;
 }
 
+static int shm_parse_options(char *options)
+{
+	int blocks = shm_ctlall;
+	int inodes = shm_ctlmni;
+	umode_t mode = shm_mode;
+	char *this_char, *value;
+
+	this_char = NULL;
+	if ( options )
+		this_char = strtok(options,",");
+	for ( ; this_char; this_char = strtok(NULL,",")) {
+		if ((value = strchr(this_char,'=')) != NULL)
+			*value++ = 0;
+		if (!strcmp(this_char,"nr_blocks")) {
+			if (!value || !*value)
+				return 1;
+			blocks = simple_strtoul(value,&value,0);
+			if (*value)
+				return 1;
+		}
+		else if (!strcmp(this_char,"nr_inodes")) {
+			if (!value || !*value)
+				return 1;
+			inodes = simple_strtoul(value,&value,0);
+			if (*value)
+				return 1;
+		}
+		else if (!strcmp(this_char,"mode")) {
+			if (!value || !*value)
+				return 1;
+			mode = simple_strtoul(value,&value,8);
+			if (*value)
+				return 1;
+		}
+		else
+			return 1;
+	}
+	shm_ctlmni = inodes;
+	shm_ctlall = blocks;
+	shm_mode   = mode;
+
+	return 0;
+}
+
+static struct super_block *shm_read_super(struct super_block *s,void *data, 
+					  int silent)
+{
+	struct inode * root_inode;
+
+	if (shm_sb) {
+		printk ("shm fs already mounted\n");
+		return NULL;
+	}
+
+	shm_ctlall = SHMALL;
+	shm_ctlmni = SHMMNI;
+	shm_mode   = S_IRWXUGO | S_ISVTX;
+	if (shm_parse_options (data)) {
+		printk ("shm fs invalid option\n");
+		goto out_unlock;
+	}
+
+	s->s_blocksize = PAGE_SIZE;
+	s->s_blocksize_bits = PAGE_SHIFT;
+	s->s_magic = SHM_FS_MAGIC;
+	s->s_op = &shm_sops;
+	root_inode = iget (s, SEQ_MULTIPLIER);
+	if (!root_inode)
+		goto out_no_root;
+	root_inode->i_op = &shm_root_inode_operations;
+	root_inode->i_sb = s;
+	root_inode->i_nlink = 2;
+	root_inode->i_mode = S_IFDIR | shm_mode;
+	s->s_root = d_alloc_root(root_inode);
+	if (!s->s_root)
+		goto out_no_root;
+	s->u.generic_sbp = (void*) shm_sb;
+	shm_sb = s;
+	return s;
+
+out_no_root:
+	printk("shm_read_super: get root inode failed\n");
+	iput(root_inode);
+out_unlock:
+	return NULL;
+}
+
+static int shm_remount_fs (struct super_block *sb, int *flags, char *data)
+{
+	if (shm_parse_options (data))
+		return -EINVAL;
+	return 0;
+}
+
+static void shm_put_super(struct super_block *sb)
+{
+	struct super_block **p = &shm_sb;
+	int i;
+	struct shmid_kernel *shp;
+
+	while (*p != sb) {
+		if (!*p)	/* should never happen */
+			return;
+		p = (struct super_block **)&(*p)->u.generic_sbp;
+	}
+	*p = (struct super_block *)(*p)->u.generic_sbp;
+	down(&shm_ids.sem);
+	for(i = 0; i <= shm_ids.max_id; i++) {
+		if (i == zero_id)
+			continue;
+		if (!(shp = shm_lock (i)))
+			continue;
+		if (shp->shm_nattch)
+			printk ("shm_nattch = %ld\n", shp->shm_nattch);
+		shp = shm_rmid(i);
+		shm_unlock(i);
+		killseg_core(shp, 1);
+	}
+	dput (sb->s_root);
+	up(&shm_ids.sem);
+}
+
+static int shm_statfs(struct super_block *sb, struct statfs *buf)
+{
+	buf->f_type = 0;
+	buf->f_bsize = PAGE_SIZE;
+	buf->f_blocks = shm_ctlall;
+	buf->f_bavail = buf->f_bfree = shm_ctlall - shm_tot;
+	buf->f_files = shm_ctlmni;
+	buf->f_ffree = shm_ctlmni - used_segs;
+	buf->f_namelen = SHM_NAME_LEN;
+	return 0;
+}
+
+static void shm_write_inode(struct inode * inode)
+{
+}
+
+static void shm_read_inode(struct inode * inode)
+{
+	int id;
+	struct shmid_kernel *shp;
+
+	id = inode->i_ino;
+	inode->i_op = NULL;
+	inode->i_mode = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	if (id < SEQ_MULTIPLIER) {
+		if (!(shp = shm_lock (id)))
+			return;
+		inode->i_mode = shp->shm_perm.mode | S_IFREG;
+		inode->i_uid  = shp->shm_perm.uid;
+		inode->i_gid  = shp->shm_perm.gid;
+		inode->i_size = shp->shm_segsz;
+		shm_unlock (id);
+		inode->i_op  = &shm_inode_operations;
+		inode->i_fop = &shm_file_operations;
+		return;
+	}
+	inode->i_op    = &shm_root_inode_operations;
+	inode->i_fop   = &shm_root_operations;
+	inode->i_sb    = shm_sb;
+	inode->i_nlink = 2;
+	inode->i_mode  = S_IFDIR | shm_mode;
+	inode->i_uid   = inode->i_gid = 0;
+
+}
+
+static int shm_create (struct inode *dir, struct dentry *dent, int mode)
+{
+	int id, err;
+	struct inode * inode;
+
+	down(&shm_ids.sem);
+	err = id = newseg (IPC_PRIVATE, dent->d_name.name, dent->d_name.len, mode, 0);
+	if (err < 0)
+		goto out;
+
+	err = -ENOMEM;
+	inode = iget (shm_sb, id % SEQ_MULTIPLIER);
+	if (!inode)
+		goto out;
+
+	err = 0;
+	down (&inode->i_sem);
+	inode->i_mode = mode | S_IFREG;
+	inode->i_op   = &shm_inode_operations;
+	d_instantiate(dent, inode);
+	up (&inode->i_sem);
+
+out:
+	up(&shm_ids.sem);
+	return err;
+}
+
+static int shm_readdir (struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode * inode = filp->f_dentry->d_inode;
+	struct shmid_kernel *shp;
+	off_t nr;
+
+	nr = filp->f_pos;
+
+	switch(nr)
+	{
+	case 0:
+		if (filldir(dirent, ".", 1, nr, inode->i_ino) < 0)
+			return 0;
+		filp->f_pos = ++nr;
+		/* fall through */
+	case 1:
+		if (filldir(dirent, "..", 2, nr, inode->i_ino) < 0)
+			return 0;
+		filp->f_pos = ++nr;
+		/* fall through */
+	default:
+		down(&shm_ids.sem);
+		for (; nr-2 <= shm_ids.max_id; nr++ ) {
+			if (!(shp = shm_get (nr-2))) 
+				continue;
+			if (shp->shm_perm.mode & SHM_DEST)
+				continue;
+			if (filldir(dirent, shp->shm_name, shp->shm_namelen, nr, nr) < 0 )
+				break;
+		}
+		filp->f_pos = nr;
+		up(&shm_ids.sem);
+		break;
+	}
+
+	UPDATE_ATIME(inode);
+	return 0;
+}
+
+static struct dentry *shm_lookup (struct inode *dir, struct dentry *dent)
+{
+	int i, err = 0;
+	struct shmid_kernel* shp;
+	struct inode *inode = NULL;
+
+	if (dent->d_name.len > SHM_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	down(&shm_ids.sem);
+	for(i = 0; i <= shm_ids.max_id; i++) {
+		if (!(shp = shm_lock(i)))
+			continue;
+		if (!(shp->shm_perm.mode & SHM_DEST) &&
+		    dent->d_name.len == shp->shm_namelen &&
+		    strncmp(dent->d_name.name, shp->shm_name, shp->shm_namelen) == 0)
+			goto found;
+		shm_unlock(i);
+	}
+
+	/*
+	 * Prevent the reserved names from becoming negative dentries.
+	 * This also prevents object creation through the filesystem.
+	 */
+	if (dent->d_name.len == SHM_FMT_LEN &&
+	    memcmp (SHM_FMT, dent->d_name.name, SHM_FMT_LEN - 8) == 0)
+		err = -EINVAL;	/* EINVAL to give IPC_RMID the right error */
+
+	goto out;
+
+found:
+	shm_unlock(i);
+	inode = iget(dir->i_sb, i);
+
+	if (!inode)
+		err = -EACCES;
+out:
+	if (err == 0)
+		d_add (dent, inode);
+	up (&shm_ids.sem);
+	return ERR_PTR(err);
+}
+
+static int shm_unlink (struct inode *dir, struct dentry *dent)
+{
+	struct inode * inode = dent->d_inode;
+	struct shmid_kernel *shp;
+
+	down (&shm_ids.sem);
+	if (!(shp = shm_lock (inode->i_ino)))
+		BUG();
+	shp->shm_perm.mode |= SHM_DEST;
+	shp->shm_perm.key = IPC_PRIVATE; /* Do not find it any more */
+	shm_unlock (inode->i_ino);
+	up (&shm_ids.sem);
+	inode->i_nlink -= 1;
+	d_delete (dent);
+	return 0;
+}
+
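
With the filesystem mounted, segments are visible as regular files and the
superblock limits show up through statfs(); a hedged user-space sketch of
what shm_readdir() and shm_statfs() above serve (the mount point is assumed
to be the default shm_path):

	#include <dirent.h>
	#include <stdio.h>
	#include <sys/vfs.h>

	int main(void)
	{
		struct statfs sb;
		struct dirent *de;
		DIR *d = opendir("/var/shm");

		if (d) {
			while ((de = readdir(d)) != NULL)
				printf("%s\n", de->d_name);	/* e.g. .IPC_0000002a */
			closedir(d);
		}
		if (statfs("/var/shm", &sb) == 0)
			printf("blocks=%ld free=%ld inodes=%ld\n",
			       (long)sb.f_blocks, (long)sb.f_bfree,
			       (long)sb.f_files);
		return 0;
	}
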
 #define SHM_ENTRY(shp, index) (shp)->shm_dir[(index)/PTRS_PER_PTE][(index)%PTRS_PER_PTE]
 
 static pte_t **shm_alloc(unsigned long pages)
@@ -124,9 +519,12 @@
 	unsigned short last = pages % PTRS_PER_PTE;
 	pte_t **ret, **ptr;
 
+	if (pages == 0)
+		return NULL;
+
 	ret = kmalloc ((dir+1) * sizeof(pte_t *), GFP_KERNEL);
 	if (!ret)
-		goto out;
+		goto nomem;
 
 	for (ptr = ret; ptr < ret+dir ; ptr++)
 	{
@@ -143,7 +541,6 @@
 			goto free;
 		memset (*ptr, 0, last*sizeof(pte_t));
 	}
-out:	
 	return ret;
 
 free:
@@ -152,48 +549,90 @@
 		free_page ((unsigned long)*ptr);
 
 	kfree (ret);
-	return NULL;
+nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
-
 static void shm_free(pte_t** dir, unsigned long pages)
 {
 	pte_t **ptr = dir+pages/PTRS_PER_PTE;
 
+	if (!dir)
+		return;
+
 	/* first the last page */
 	if (pages%PTRS_PER_PTE)
 		kfree (*ptr);
 	/* now the whole pages */
 	while (--ptr >= dir)
-		free_page ((unsigned long)*ptr);
+		if (*ptr)
+			free_page ((unsigned long)*ptr);
 
 	/* Now the indirect block */
 	kfree (dir);
 }
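
shm_alloc() now distinguishes "zero pages requested" (NULL) from allocation
failure (ERR_PTR(-ENOMEM)); a hypothetical kernel-side caller showing the
IS_ERR/PTR_ERR idiom, as shm_setattr() below already uses it (grow_segment
is not part of this patch and assumes this file's context):

	static int grow_segment(struct shmid_kernel *shp, unsigned long pages)
	{
		pte_t **dir = shm_alloc(pages);

		if (IS_ERR(dir))
			return PTR_ERR(dir);	/* -ENOMEM */
		shm_free(shp->shm_dir, shp->shm_npages);
		shp->shm_dir = dir;		/* may be NULL for pages == 0 */
		shp->shm_npages = pages;
		return 0;
	}
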
 
-static int shm_revalidate(struct shmid_kernel* shp, int shmid, int pagecount, int flg)
+static int shm_setattr (struct dentry *dentry, struct iattr *attr)
 {
-	struct shmid_kernel* new;
-	new = shm_lock(shmid);
-	if(new==NULL) {
-		return -EIDRM;
-	}
-	if(new!=shp || shm_checkid(shp, shmid) || shp->shm_npages != pagecount) {
-		shm_unlock(shmid);
-		return -EIDRM;
-	}
-	if (ipcperms(&shp->shm_perm, flg)) {
-		shm_unlock(shmid);
-		return -EACCES;
+	int error;
+	struct inode *inode = dentry->d_inode;
+	struct shmid_kernel *shp;
+	unsigned long new_pages, old_pages;
+	pte_t **new_dir, **old_dir;
+
+	if ((error = inode_change_ok(inode, attr)))
+		return error;
+	if (!(attr->ia_valid & ATTR_SIZE))
+		goto set_attr;
+	if (attr->ia_size > shm_ctlmax)
+		return -EFBIG;
+
+	/* We set old_pages and old_dir for easier cleanup */
+	old_pages = new_pages = (attr->ia_size  + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (shm_tot + new_pages >= shm_ctlall)
+		return -ENOSPC;
+	if (IS_ERR(old_dir = new_dir = shm_alloc(new_pages)))
+		return PTR_ERR(new_dir);
+
+	if (!(shp = shm_lock(inode->i_ino)))
+		BUG();
+	if (shp->shm_segsz == attr->ia_size)
+		goto out;
+	old_dir = shp->shm_dir;
+	old_pages = shp->shm_npages;
+	if (old_dir){
+		pte_t *swap;
+		int i,j;
+		i = old_pages < new_pages ? old_pages : new_pages;
+		j = i % PTRS_PER_PTE;
+		i /= PTRS_PER_PTE;
+		if (j)
+			memcpy (new_dir[i], old_dir[i], j * sizeof (pte_t));
+		while (i--) {
+			swap = new_dir[i];
+			new_dir[i] = old_dir[i];
+			old_dir[i] = swap;
+		}
 	}
+	shp->shm_dir = new_dir;
+	shp->shm_npages = new_pages;
+	shp->shm_segsz = attr->ia_size;
+out:
+	shm_unlock(inode->i_ino);
+	shm_lockall();
+	shm_tot += new_pages - old_pages;
+	shm_unlockall();
+	shm_free (old_dir, old_pages);
+set_attr:
+	inode_setattr(inode, attr);
 	return 0;
 }
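
Since ATTR_SIZE reaches shm_setattr() through the normal VFS notify_change
path, a segment can be resized from user space by truncating its file; a
hypothetical sketch (the path is illustrative):

	#include <fcntl.h>
	#include <unistd.h>

	int resize_segment(const char *path, off_t size)
	{
		int fd = open(path, O_RDWR);	/* e.g. /var/shm/.IPC_0000002a */

		if (fd < 0)
			return -1;
		if (ftruncate(fd, size) < 0) {	/* ATTR_SIZE -> shm_setattr() */
			close(fd);
			return -1;
		}
		return close(fd);
	}
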
 
-static inline struct shmid_kernel *newseg_alloc(int numpages)
+static inline struct shmid_kernel *newseg_alloc(int numpages, size_t namelen)
 {
 	struct shmid_kernel *shp;
 
-	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
+	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp) + namelen, GFP_KERNEL);
 	if (!shp)
 		return 0;
 
@@ -203,29 +642,29 @@
 		return 0;
 	}
 	shp->shm_npages = numpages;
-	shp->attaches = NULL;
 	shp->shm_nattch = 0;
-	init_MUTEX(&shp->sem);
+	shp->shm_namelen = namelen;
 	return(shp);
 }
 
-static int newseg (key_t key, int shmflg, size_t size)
+static int newseg (key_t key, const char *name, int namelen,
+		   int shmflg, size_t size)
 {
 	struct shmid_kernel *shp;
 	int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
 	int id;
 
-	if (size < SHMMIN)
-		return -EINVAL;
+	if (namelen > SHM_NAME_LEN)
+		return -ENAMETOOLONG;
 
 	if (size > shm_ctlmax)
 		return -EINVAL;
 	if (shm_tot + numpages >= shm_ctlall)
 		return -ENOSPC;
 
-	if (!(shp = newseg_alloc(numpages)))
+	if (!(shp = newseg_alloc(numpages, namelen ? namelen : SHM_FMT_LEN + 1)))
 		return -ENOMEM;
-	id = ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni);
+	id = ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni+1);
 	if(id == -1) {
 		shm_free(shp->shm_dir,numpages);
 		kfree(shp);
@@ -234,16 +673,23 @@
 	shp->shm_perm.key = key;
 	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
 	shp->shm_segsz = size;
-	shp->shm_cpid = current->pid;
-	shp->shm_lpid = 0;
-	shp->shm_atime = shp->shm_dtime = 0;
-	shp->shm_ctime = CURRENT_TIME;
+	shp->shm_cprid = current->pid;
+	shp->shm_lprid = 0;
+	shp->shm_atim = shp->shm_dtim = 0;
+	shp->shm_ctim = CURRENT_TIME;
 	shp->id = shm_buildid(id,shp->shm_perm.seq);
+	if (namelen != 0) {
+		shp->shm_namelen = namelen;
+		memcpy (shp->shm_name, name, namelen);		  
+	} else {
+		shp->shm_namelen = sprintf (shp->shm_name, SHM_FMT, shp->id);
+	}
 
 	shm_tot += numpages;
+	used_segs++;
 	shm_unlock(id);
-
-	return shm_buildid(id,shp->shm_perm.seq);
+	
+	return shp->id;
 }
 
 asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
@@ -251,21 +697,31 @@
 	struct shmid_kernel *shp;
 	int err, id = 0;
 
+	if (!shm_sb) {
+		printk ("shmget: shm filesystem not mounted\n");
+		return -EINVAL;
+	}
+
+	if (size < SHMMIN)
+		return -EINVAL;
+
 	down(&shm_ids.sem);
 	if (key == IPC_PRIVATE) {
-		err = newseg(key, shmflg, size);
+		err = newseg(key, NULL, 0, shmflg, size);
 	} else if ((id = ipc_findkey(&shm_ids,key)) == -1) {
 		if (!(shmflg & IPC_CREAT))
 			err = -ENOENT;
 		else
-			err = newseg(key, shmflg, size);
+			err = newseg(key, NULL, 0, shmflg, size);
 	} else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
 		err = -EEXIST;
 	} else {
 		shp = shm_lock(id);
 		if(shp==NULL)
 			BUG();
-		if (ipcperms(&shp->shm_perm, shmflg))
+		if (shp->shm_segsz < size)
+			err = -EINVAL;
+		else if (ipcperms(&shp->shm_perm, shmflg))
 			err = -EACCES;
 		else
 			err = shm_buildid(id, shp->shm_perm.seq);
@@ -300,40 +756,26 @@
 		shm_rss -= rss;
 		shm_swp -= swp;
 		shm_tot -= numpages;
+		used_segs--;
 		shm_unlockall();
 	}
 }
 
-/*
- * Only called after testing nattch and SHM_DEST.
- * Here pages, pgtable and shmid_kernel are freed.
- */
-static void killseg (int shmid)
+static void shm_delete (struct inode *ino)
 {
+	int shmid = ino->i_ino;
 	struct shmid_kernel *shp;
 
 	down(&shm_ids.sem);
 	shp = shm_lock(shmid);
 	if(shp==NULL) {
-out_up:
-		up(&shm_ids.sem);
-		return;
-	}
-	if(shm_checkid(shp,shmid) || shp->shm_nattch > 0 ||
-	    !(shp->shm_perm.mode & SHM_DEST)) {
-		shm_unlock(shmid);
-		goto out_up;
+		BUG();
 	}
 	shp = shm_rmid(shmid);
-	if(shp==NULL)
-		BUG();
-	if (!shp->shm_dir)
-		BUG();
 	shm_unlock(shmid);
 	up(&shm_ids.sem);
 	killseg_core(shp, 1);
-
-	return;
+	clear_inode(ino);
 }
 
 static inline unsigned long copy_shmid_to_user(void *buf, struct shmid64_ds *in, int version)
@@ -427,12 +869,29 @@
 	}
 }
 
+char * shm_getname(int id)
+{
+	char *result;
+
+	result = __getname ();
+	if (IS_ERR(result))
+		return result;
+
+	sprintf (result, "%s/" SHM_FMT, shm_path, id); 
+	return result;
+}
+
 asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds *buf)
 {
 	struct shm_setbuf setbuf;
 	struct shmid_kernel *shp;
 	int err, version;
 
+	if (!shm_sb) {
+		printk ("shmctl: shm filesystem not mounted\n");
+		return -EINVAL;
+	}
+
 	if (cmd < 0 || shmid < 0)
 		return -EINVAL;
 
@@ -481,14 +940,12 @@
 	{
 		struct shmid64_ds tbuf;
 		int result;
+		if ((shmid % SEQ_MULTIPLIER) == zero_id)
+			return -EINVAL;
 		memset(&tbuf, 0, sizeof(tbuf));
 		shp = shm_lock(shmid);
 		if(shp==NULL)
 			return -EINVAL;
-		if (shp == &zshmid_kernel) {
-			shm_unlock(shmid);
-			return -EINVAL;
-		}
 		if(cmd==SHM_STAT) {
 			err = -EINVAL;
 			if (shmid > shm_ids.max_id)
@@ -505,11 +962,11 @@
 			goto out_unlock;
 		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
 		tbuf.shm_segsz	= shp->shm_segsz;
-		tbuf.shm_atime	= shp->shm_atime;
-		tbuf.shm_dtime	= shp->shm_dtime;
-		tbuf.shm_ctime	= shp->shm_ctime;
-		tbuf.shm_cpid	= shp->shm_cpid;
-		tbuf.shm_lpid	= shp->shm_lpid;
+		tbuf.shm_atime	= shp->shm_atim;
+		tbuf.shm_dtime	= shp->shm_dtim;
+		tbuf.shm_ctime	= shp->shm_ctim;
+		tbuf.shm_cpid	= shp->shm_cprid;
+		tbuf.shm_lpid	= shp->shm_lprid;
 		tbuf.shm_nattch	= shp->shm_nattch;
 		shm_unlock(shmid);
 		if(copy_shmid_to_user (buf, &tbuf, version))
@@ -523,16 +980,14 @@
 /* Should the pages be faulted in here, or should that be left to user space? */
 /* need to determine interaction with current->swappable */
 		struct kern_ipc_perm *ipcp;
+		if ((shmid % SEQ_MULTIPLIER)== zero_id)
+			return -EINVAL;
 		if (!capable(CAP_IPC_LOCK))
 			return -EPERM;
 
 		shp = shm_lock(shmid);
 		if(shp==NULL)
 			return -EINVAL;
-		if (shp == &zshmid_kernel) {
-			shm_unlock(shmid);
-			return -EINVAL;
-		}
 		err=-EIDRM;
 		if(shm_checkid(shp,shmid))
 			goto out_unlock;
@@ -552,50 +1007,56 @@
 		return err;
 	}
 	case IPC_RMID:
-	case IPC_SET:
-		break;
-	default:
-		return -EINVAL;
+	{
+		char *name;
+		if ((shmid % SEQ_MULTIPLIER)== zero_id)
+			return -EINVAL;
+		name = shm_getname(shmid);
+		if (IS_ERR(name))
+			return PTR_ERR(name);
+		lock_kernel();
+		err = do_unlink (name);
+		unlock_kernel();
+		putname (name);
+		if (err == -ENOENT)
+			err = -EINVAL;
+		return err;
 	}
 
-	if (cmd == IPC_SET) {
+	case IPC_SET:
+	{
+		if ((shmid % SEQ_MULTIPLIER)== zero_id)
+			return -EINVAL;
+
 		if(copy_shmid_from_user (&setbuf, buf, version))
 			return -EFAULT;
-	}
-	down(&shm_ids.sem);
-	shp = shm_lock(shmid);
-	err=-EINVAL;
-	if(shp==NULL)
-		goto out_up;
-	if (shp == &zshmid_kernel)
-		goto out_unlock_up;
-	err=-EIDRM;
-	if(shm_checkid(shp,shmid))
-		goto out_unlock_up;
-	err=-EPERM;
-	if (current->euid != shp->shm_perm.uid &&
-	    current->euid != shp->shm_perm.cuid && 
-	    !capable(CAP_SYS_ADMIN)) {
-		goto out_unlock_up;
-	}
+		down(&shm_ids.sem);
+		shp = shm_lock(shmid);
+		err=-EINVAL;
+		if(shp==NULL)
+			goto out_up;
+		err=-EIDRM;
+		if(shm_checkid(shp,shmid))
+			goto out_unlock_up;
+		err=-EPERM;
+		if (current->euid != shp->shm_perm.uid &&
+		    current->euid != shp->shm_perm.cuid && 
+		    !capable(CAP_SYS_ADMIN)) {
+			goto out_unlock_up;
+		}
 
-	switch (cmd) {
-	case IPC_SET:
 		shp->shm_perm.uid = setbuf.uid;
 		shp->shm_perm.gid = setbuf.gid;
 		shp->shm_perm.mode = (shp->shm_perm.mode & ~S_IRWXUGO)
 			| (setbuf.mode & S_IRWXUGO);
-		shp->shm_ctime = CURRENT_TIME;
+		shp->shm_ctim = CURRENT_TIME;
 		break;
-	case IPC_RMID:
-		shp->shm_perm.mode |= SHM_DEST;
-		if (shp->shm_nattch <= 0) {
-			shm_unlock(shmid);
-			up(&shm_ids.sem);
-			killseg (shmid);
-			return 0;
-		}
 	}
+
+	default:
+		return -EINVAL;
+	}
+
 	err = 0;
 out_unlock_up:
 	shm_unlock(shmid);
@@ -607,65 +1068,24 @@
 	return err;
 }
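
A hedged user-space sketch of the IPC_STAT path above, which copies the
renamed shm_atim/shm_dtim/shm_ctim fields back out through
copy_shmid_to_user():

	#include <stdio.h>
	#include <sys/ipc.h>
	#include <sys/shm.h>

	int show_segment(int id)
	{
		struct shmid_ds ds;

		if (shmctl(id, IPC_STAT, &ds) < 0)
			return -1;
		printf("size=%lu nattch=%lu\n",
		       (unsigned long)ds.shm_segsz,
		       (unsigned long)ds.shm_nattch);
		return 0;
	}
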
 
-/*
- * The per process internal structure for managing segments is
- * `struct vm_area_struct'.
- * A shmat will add to and shmdt will remove from the list.
- * shmd->vm_mm		the attacher
- * shmd->vm_start	virt addr of attach, multiple of SHMLBA
- * shmd->vm_end		multiple of SHMLBA
- * shmd->vm_next	next attach for task
- * shmd->vm_next_share	next attach for segment
- * shmd->vm_pgoff	offset into segment (in pages)
- * shmd->vm_private_data		signature for this attach
- */
-
-static struct vm_operations_struct shm_vm_ops = {
-	open:		shm_open,	/* open - callback for a new vm-area open */
-	close:		shm_close,	/* close - callback for when the vm-area is released */
-	nopage:		shm_nopage,
-	swapout:	shm_swapout,
-};
-
-/* Insert shmd into the list shp->attaches */
-static inline void insert_attach (struct shmid_kernel * shp, struct vm_area_struct * shmd)
-{
-	if((shmd->vm_next_share = shp->attaches) != NULL)
-		shp->attaches->vm_pprev_share = &shmd->vm_next_share;
-	shp->attaches = shmd;
-	shmd->vm_pprev_share = &shp->attaches;
-}
+static inline void shm_inc (int id) {
+	struct shmid_kernel *shp;
 
-/* Remove shmd from list shp->attaches */
-static inline void remove_attach (struct shmid_kernel * shp, struct vm_area_struct * shmd)
-{
-	if(shmd->vm_next_share)
-		shmd->vm_next_share->vm_pprev_share = shmd->vm_pprev_share;
-	*shmd->vm_pprev_share = shmd->vm_next_share;
+	if(!(shp = shm_lock(id)))
+		BUG();
+	shp->shm_atim = CURRENT_TIME;
+	shp->shm_lprid = current->pid;
+	shp->shm_nattch++;
+	shm_unlock(id);
 }
 
-/*
- * ensure page tables exist
- * mark page table entries with shm_sgn.
- */
-static int shm_map (struct vm_area_struct *shmd)
+static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 {
-	unsigned long tmp;
-
-	/* clear old mappings */
-	do_munmap(shmd->vm_start, shmd->vm_end - shmd->vm_start);
-
-	/* add new mapping */
-	tmp = shmd->vm_end - shmd->vm_start;
-	if((current->mm->total_vm << PAGE_SHIFT) + tmp
-	   > (unsigned long) current->rlim[RLIMIT_AS].rlim_cur)
-		return -ENOMEM;
-	current->mm->total_vm += tmp >> PAGE_SHIFT;
-	vmlist_modify_lock(current->mm);
-	insert_vm_struct(current->mm, shmd);
-	merge_segments(current->mm, shmd->vm_start, shmd->vm_end);
-	vmlist_modify_unlock(current->mm);
-
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL; /* we cannot do private mappings */
+	UPDATE_ATIME(file->f_dentry->d_inode);
+	vma->vm_ops = &shm_vm_ops;
+	shm_inc(file->f_dentry->d_inode->i_ino);
 	return 0;
 }
 
@@ -674,137 +1094,57 @@
  */
 asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
 {
-	struct shmid_kernel *shp;
-	struct vm_area_struct *shmd;
-	int err;
 	unsigned long addr;
-	unsigned long len;
-	short flg = shmflg & SHM_RDONLY ? S_IRUGO : S_IRUGO|S_IWUGO;
-
+	struct file * file;
+	int    err;
+	int    flags;
+	char   *name;
 
-	if (shmid < 0)
+	if (!shm_sb || (shmid % SEQ_MULTIPLIER) == zero_id)
 		return -EINVAL;
 
-	down(&current->mm->mmap_sem);
-	err = -EINVAL;
-	shp = shm_lock(shmid);
-	if (!shp)
-		goto out_up;
-	if (shp == &zshmid_kernel)
-		goto out_unlock_up;
-
-	err = -EACCES;
-	if (ipcperms(&shp->shm_perm, flg))
-		goto out_unlock_up;
-
-	err = -EIDRM;
-	if (shm_checkid(shp,shmid))
-		goto out_unlock_up;
-
-	if (!(addr = (ulong) shmaddr)) {
-		if (shmflg & SHM_REMAP)
-			goto out_unlock_up;
-		err = -ENOMEM;
-		addr = 0;
-	again:
-		if (!(addr = get_unmapped_area(addr, (unsigned long)shp->shm_segsz)))
-			goto out_unlock_up;
-		if(addr & (SHMLBA - 1)) {
-			addr = (addr + (SHMLBA - 1)) & ~(SHMLBA - 1);
-			goto again;
+	if ((addr = (ulong)shmaddr))
+	{
+		if(addr & (SHMLBA-1)) {
+			if (shmflg & SHM_RND)
+				addr &= ~(SHMLBA-1);	   /* round down */
+			else
+				return -EINVAL;
 		}
-	} else if (addr & (SHMLBA-1)) {
-		err=-EINVAL;
-		if (shmflg & SHM_RND)
-			addr &= ~(SHMLBA-1);       /* round down */
-		else
-			goto out_unlock_up;
-	}
-	/*
-	 * Check if addr exceeds TASK_SIZE (from do_mmap)
-	 */
-	len = PAGE_SIZE*shp->shm_npages;
-	err = -EINVAL;
-	if (addr >= TASK_SIZE || len > TASK_SIZE  || addr > TASK_SIZE - len)
-		goto out_unlock_up;
-	/*
-	 * If shm segment goes below stack, make sure there is some
-	 * space left for the stack to grow (presently 4 pages).
-	 */
-	if (addr < current->mm->start_stack &&
-	    addr > current->mm->start_stack - PAGE_SIZE*(shp->shm_npages + 4))
-		goto out_unlock_up;
-	if (!(shmflg & SHM_REMAP) && find_vma_intersection(current->mm, addr, addr + (unsigned long)shp->shm_segsz))
-		goto out_unlock_up;
-
-	shm_unlock(shmid);
-	err = -ENOMEM;
-	shmd = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-	err = shm_revalidate(shp, shmid, len/PAGE_SIZE,flg);
-	if(err)	{
-		kmem_cache_free(vm_area_cachep, shmd);
-		goto out_up;
-	}
-
-	shmd->vm_private_data = shp;
-	shmd->vm_start = addr;
-	shmd->vm_end = addr + shp->shm_npages * PAGE_SIZE;
-	shmd->vm_mm = current->mm;
-	shmd->vm_page_prot = (shmflg & SHM_RDONLY) ? PAGE_READONLY : PAGE_SHARED;
-	shmd->vm_flags = VM_SHM | VM_MAYSHARE | VM_SHARED
-			 | VM_MAYREAD | VM_MAYEXEC | VM_READ | VM_EXEC
-			 | ((shmflg & SHM_RDONLY) ? 0 : VM_MAYWRITE | VM_WRITE);
-	shmd->vm_file = NULL;
-	shmd->vm_pgoff = 0;
-	shmd->vm_ops = &shm_vm_ops;
-
-	shp->shm_nattch++;	    /* prevent destruction */
-	shm_unlock(shp->id);
-	err = shm_map (shmd);
-	shm_lock(shmid); /* cannot fail */
-	if (err)
-		goto failed_shm_map;
-
-	insert_attach(shp,shmd);  /* insert shmd into shp->attaches */
-
-	shp->shm_lpid = current->pid;
-	shp->shm_atime = CURRENT_TIME;
+		flags = MAP_SHARED | MAP_FIXED;
+	} else
+		flags = MAP_SHARED;
 
-	*raddr = addr;
-	err = 0;
-out_unlock_up:
-	shm_unlock(shmid);
-out_up:
-	up(&current->mm->mmap_sem);
+	name = shm_getname(shmid);
+	if (IS_ERR (name))
+		return PTR_ERR (name);
+
+	file = filp_open (name, O_RDWR, 0);
+	putname (name);
+	if (IS_ERR (file))
+		goto bad_file;
+	lock_kernel();
+	*raddr = do_mmap (file, addr, file->f_dentry->d_inode->i_size,
+			  (shmflg & SHM_RDONLY ? PROT_READ :
+			   PROT_READ | PROT_WRITE), flags, 0);
+	unlock_kernel();
+	if (IS_ERR(*raddr))
+		err = PTR_ERR(*raddr);
+	else
+		err = 0;
+	fput (file);
 	return err;
 
-failed_shm_map:
-	{
-		int delete = 0;
-		if (--shp->shm_nattch <= 0 && shp->shm_perm.mode & SHM_DEST)
-			delete = 1;
-		shm_unlock(shmid);
-		up(&current->mm->mmap_sem);
-		kmem_cache_free(vm_area_cachep, shmd);
-		if(delete)
-			killseg(shmid);
-		return err;
-	}
+bad_file:
+	if ((err = PTR_ERR(file)) == -ENOENT)
+		return -EINVAL;
+	return err;
 }
 
 /* This is called by fork, once for every shm attach. */
 static void shm_open (struct vm_area_struct *shmd)
 {
-	struct shmid_kernel *shp;
-
-	shp = (struct shmid_kernel *) shmd->vm_private_data;
-	if(shp != shm_lock(shp->id))
-		BUG();
-	insert_attach(shp,shmd);  /* insert shmd into shp->attaches */
-	shp->shm_nattch++;
-	shp->shm_atime = CURRENT_TIME;
-	shp->shm_lpid = current->pid;
-	shm_unlock(shp->id);
+	shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino);
 }
 
 /*
@@ -815,22 +1155,16 @@
  */
 static void shm_close (struct vm_area_struct *shmd)
 {
+	int id = shmd->vm_file->f_dentry->d_inode->i_ino;
 	struct shmid_kernel *shp;
-	int id;
 
 	/* remove from the list of attaches of the shm segment */
-	shp = (struct shmid_kernel *) shmd->vm_private_data;
-	if(shp != shm_lock(shp->id))
+	if(!(shp = shm_lock(id)))
 		BUG();
-	remove_attach(shp,shmd);  /* remove from shp->attaches */
-  	shp->shm_lpid = current->pid;
-	shp->shm_dtime = CURRENT_TIME;
-	id=-1;
-	if (--shp->shm_nattch <= 0 && shp->shm_perm.mode & SHM_DEST)
-		id=shp->id;
-	shm_unlock(shp->id);
-	if(id!=-1)
-		killseg(id);
+	shp->shm_lprid = current->pid;
+	shp->shm_dtim = CURRENT_TIME;
+	shp->shm_nattch--;
+	shm_unlock(id);
 }
 
 /*
@@ -868,31 +1202,13 @@
 /*
  * page not present ... go through shm_dir
  */
-static struct page * shm_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share)
+static struct page * shm_nopage_core(struct shmid_kernel *shp, unsigned int idx, int *swp, int *rss)
 {
 	pte_t pte;
-	struct shmid_kernel *shp;
-	unsigned int idx;
 	struct page * page;
-	int is_shmzero;
 
-	shp = (struct shmid_kernel *) shmd->vm_private_data;
-	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
-	idx += shmd->vm_pgoff;
-	is_shmzero = (shp->id == zero_id);
-
-	/*
-	 * A shared mapping past the last page of the file is an error
-	 * and results in a SIGBUS, so logically a shared mapping past 
-	 * the end of a shared memory segment should result in SIGBUS
-	 * as well.
-	 */
-	if (idx >= shp->shm_npages) { 
-		return NULL;
-	}
-	down(&shp->sem);
-	if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
-		BUG();
+	if (idx >= shp->shm_npages)
+		goto sigbus;
 
 	pte = SHM_ENTRY(shp,idx);
 	if (!pte_present(pte)) {
@@ -905,7 +1221,7 @@
 			if (!page)
 				goto oom;
 			clear_highpage(page);
-			if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
+			if ((shp != shm_lock(shp->id)) && (shp->id != zero_id))
 				BUG();
 		} else {
 			swp_entry_t entry = pte_to_swp_entry(pte);
@@ -923,11 +1239,11 @@
 			delete_from_swap_cache(page);
 			page = replace_with_highmem(page);
 			swap_free(entry);
-			if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
+			if ((shp != shm_lock(shp->id)) && (shp->id != zero_id))
 				BUG();
-			if (is_shmzero == 0) shm_swp--;
+			(*swp)--;
 		}
-		if (is_shmzero == 0) shm_rss++;
+		(*rss)++;
 		pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
 		SHM_ENTRY(shp, idx) = pte;
 	} else
@@ -935,14 +1251,32 @@
 
 	/* pte_val(pte) == SHM_ENTRY (shp, idx) */
 	get_page(pte_page(pte));
-	shm_unlock(shp->id);
-	up(&shp->sem);
 	current->min_flt++;
 	return pte_page(pte);
 
 oom:
-	up(&shp->sem);
 	return NOPAGE_OOM;
+sigbus:
+	return NOPAGE_SIGBUS;
+}
+
+static struct page * shm_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share)
+{
+	struct page * page;
+	struct shmid_kernel *shp;
+	unsigned int idx;
+	struct inode * inode = shmd->vm_file->f_dentry->d_inode;
+
+	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
+	idx += shmd->vm_pgoff;
+
+	down(&inode->i_sem);
+	if(!(shp = shm_lock(inode->i_ino)))
+		BUG();
+	page = shm_nopage_core(shp, idx, &shm_swp, &shm_rss);
+	shm_unlock(inode->i_ino);
+	up(&inode->i_sem);
+	return(page);
 }
 
 #define OKAY	0
@@ -1127,38 +1461,40 @@
 	int i, len = 0;
 
 	down(&shm_ids.sem);
-	len += sprintf(buffer, "       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n");
+	len += sprintf(buffer, "       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime name\n");
 
-    	for(i = 0; i <= shm_ids.max_id; i++) {
-		struct shmid_kernel* shp = shm_lock(i);
-		if (shp == &zshmid_kernel) {
-			shm_unlock(i);
+	for(i = 0; i <= shm_ids.max_id; i++) {
+		struct shmid_kernel* shp;
+
+		if (i == zero_id)
 			continue;
-		}
+		shp = shm_lock(i);
 		if(shp!=NULL) {
-#define SMALL_STRING "%10d %10d  %4o %10u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
-#define BIG_STRING   "%10d %10d  %4o %21u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
+#define SMALL_STRING "%10d %10d  %4o %10u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu %.*s\n"
+#define BIG_STRING   "%10d %10d  %4o %21u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu %.*s\n"
 			char *format;
 
 			if (sizeof(size_t) <= sizeof(int))
 				format = SMALL_STRING;
 			else
 				format = BIG_STRING;
-	    		len += sprintf(buffer + len, format,
+			len += sprintf(buffer + len, format,
 				shp->shm_perm.key,
 				shm_buildid(i, shp->shm_perm.seq),
 				shp->shm_perm.mode,
 				shp->shm_segsz,
-				shp->shm_cpid,
-				shp->shm_lpid,
+				shp->shm_cprid,
+				shp->shm_lprid,
 				shp->shm_nattch,
 				shp->shm_perm.uid,
 				shp->shm_perm.gid,
 				shp->shm_perm.cuid,
 				shp->shm_perm.cgid,
-				shp->shm_atime,
-				shp->shm_dtime,
-				shp->shm_ctime);
+				shp->shm_atim,
+				shp->shm_dtim,
+				shp->shm_ctim,
+				shp->shm_namelen,
+				shp->shm_name);
 			shm_unlock(i);
 
 			pos += len;
@@ -1183,31 +1519,84 @@
 }
 #endif
 
-static struct shmid_kernel *zmap_list = 0;
+#define VMA_TO_SHP(vma)		((vma)->vm_file->private_data)
+
 static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED;
 static unsigned long zswap_idx = 0; /* next to swap */
-static struct shmid_kernel *zswap_shp = 0;
+static struct shmid_kernel *zswap_shp = &zshmid_kernel;
+static int zshm_rss;
 
 static struct vm_operations_struct shmzero_vm_ops = {
 	open:		shmzero_open,
 	close:		shmzero_close,
-	nopage:		shm_nopage,
+	nopage:		shmzero_nopage,
 	swapout:	shm_swapout,
 };
 
+/*
+ * In this implementation, the "unuse" and "swapout" interfaces are
+ * interlocked via the kernel_lock, as well as shm_lock(zero_id).
+ * "unuse" and "nopage/swapin", as well as "swapout" and "nopage/swapin",
+ * interlock via shm_lock(zero_id). All these interlocks could be based
+ * on a per-mapping lock instead of being global.
+ */
+/*
+ * Reference (existence) counting on the file/dentry/inode is done
+ * by the generic vm_file code. The zero code does not hold any
+ * reference on the pseudo-file. This is possible because the
+ * open/close calls are bracketed by the file count update calls.
+ */
+static struct file *file_setup(struct file *fzero, struct shmid_kernel *shp)
+{
+	struct file *filp;
+	struct inode *inp;
+
+	if ((filp = get_empty_filp()) == 0)
+		return(filp);
+	if ((inp = get_empty_inode()) == 0) {
+		put_filp(filp);
+		return(0);
+	}
+	if ((filp->f_dentry = d_alloc(zdent, &(const struct qstr) { "dev/zero", 
+				8, 0 })) == 0) {
+		iput(inp);
+		put_filp(filp);
+		return(0);
+	}
+	d_instantiate(filp->f_dentry, inp);
+
+	/*
+	 * Copy over the /dev/zero dev/ino for the benefit of procfs.
+	 * Use the ino to indicate separate mappings.
+	 */
+	filp->f_dentry->d_inode->i_dev = fzero->f_dentry->d_inode->i_dev;
+	filp->f_dentry->d_inode->i_ino = (unsigned long)shp;
+	fput(fzero);	/* release /dev/zero file */
+	return(filp);
+}
+
 int map_zero_setup(struct vm_area_struct *vma)
 {
+	extern int vm_enough_memory(long pages);
 	struct shmid_kernel *shp;
+	struct file *filp;
 
-	if (!(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE)))
+	if (!vm_enough_memory((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))
+		return -ENOMEM;
+	if (!(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE, 0)))
+		return -ENOMEM;
+	if ((filp = file_setup(vma->vm_file, shp)) == 0) {
+		killseg_core(shp, 0);
 		return -ENOMEM;
-	shp->id = zero_id;	/* hack for shm_lock et al */
-	vma->vm_private_data = shp;
+	}
+	vma->vm_file = filp;
+	VMA_TO_SHP(vma) = (void *)shp;
+	shp->id = zero_id;
+	init_MUTEX(&shp->zsem);
 	vma->vm_ops = &shmzero_vm_ops;
 	shmzero_open(vma);
 	spin_lock(&zmap_list_lock);
-	shp->attaches = (struct vm_area_struct *)zmap_list;
-	zmap_list = shp;
+	list_add(&shp->zero_list, &zshmid_kernel.zero_list);
 	spin_unlock(&zmap_list_lock);
 	return 0;
 }
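
map_zero_setup() is entered for shared mappings of /dev/zero; a brief
user-space sketch of a mapping that takes this path:

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/zero", O_RDWR);
		char *p;

		if (fd < 0)
			return 1;
		p = mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		close(fd);			/* the mapping holds its own file */
		if (p == MAP_FAILED)
			return 1;
		p[0] = 1;			/* faults through shmzero_nopage() */
		munmap(p, 4096);
		return 0;
	}
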
@@ -1216,53 +1605,66 @@
 {
 	struct shmid_kernel *shp;
 
-	shp = (struct shmid_kernel *) shmd->vm_private_data;
-	down(&shp->sem);
+	shp = VMA_TO_SHP(shmd);
+	down(&shp->zsem);
 	shp->shm_nattch++;
-	up(&shp->sem);
+	up(&shp->zsem);
 }
 
 static void shmzero_close(struct vm_area_struct *shmd)
 {
 	int done = 0;
-	struct shmid_kernel *shp, *prev, *cur;
+	struct shmid_kernel *shp;
 
-	shp = (struct shmid_kernel *) shmd->vm_private_data;
-	down(&shp->sem);
+	shp = VMA_TO_SHP(shmd);
+	down(&shp->zsem);
 	if (--shp->shm_nattch == 0)
 		done = 1;
-	up(&shp->sem);
+	up(&shp->zsem);
 	if (done) {
 		spin_lock(&zmap_list_lock);
 		if (shp == zswap_shp)
-			zswap_shp = (struct shmid_kernel *)(shp->attaches);
-		if (shp == zmap_list)
-			zmap_list = (struct shmid_kernel *)(shp->attaches);
-		else {
-			prev = zmap_list;
-			cur = (struct shmid_kernel *)(prev->attaches);
-			while (cur != shp) {
-				prev = cur;
-				cur = (struct shmid_kernel *)(prev->attaches);
-			}
-			prev->attaches = (struct vm_area_struct *)(shp->attaches);
-		}
+			zswap_shp = list_entry(zswap_shp->zero_list.next, 
+						struct shmid_kernel, zero_list);
+		list_del(&shp->zero_list);
 		spin_unlock(&zmap_list_lock);
 		killseg_core(shp, 0);
 	}
 }
 
+static struct page * shmzero_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share)
+{
+	struct page *page;
+	struct shmid_kernel *shp;
+	unsigned int idx;
+	int dummy;
+
+	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
+	idx += shmd->vm_pgoff;
+
+	shp = VMA_TO_SHP(shmd);
+	down(&shp->zsem);
+	shm_lock(zero_id);
+	page = shm_nopage_core(shp, idx, &dummy, &zshm_rss);
+	shm_unlock(zero_id);
+	up(&shp->zsem);
+	return(page);
+}
+
 static void zmap_unuse(swp_entry_t entry, struct page *page)
 {
 	struct shmid_kernel *shp;
 
 	spin_lock(&zmap_list_lock);
-	shp = zmap_list;
-	while (shp) {
+	shm_lock(zero_id);
+	for (shp = list_entry(zshmid_kernel.zero_list.next, struct shmid_kernel, 
+			zero_list); shp != &zshmid_kernel;
+			shp = list_entry(shp->zero_list.next, struct shmid_kernel,
+								zero_list)) {
 		if (shm_unuse_core(shp, entry, page))
 			break;
-		shp = (struct shmid_kernel *)shp->attaches;
 	}
+	shm_unlock(zero_id);
 	spin_unlock(&zmap_list_lock);
 }
 
@@ -1275,7 +1677,7 @@
 	int counter;
 	struct page * page_map;
 
-	counter = 10;	/* maybe we should use zshm_rss */
+	counter = zshm_rss >> prio;
 	if (!counter)
 		return;
 next:
@@ -1283,25 +1685,30 @@
 		return;
 
 	spin_lock(&zmap_list_lock);
-	if (zmap_list == 0)
+	shm_lock(zero_id);
+	if (zshmid_kernel.zero_list.next == 0)
 		goto failed;
 next_id:
-	if ((shp = zswap_shp) == 0) {
+	if (zswap_shp == &zshmid_kernel) {
 		if (loop) {
 failed:
+			shm_unlock(zero_id);
 			spin_unlock(&zmap_list_lock);
 			__swap_free(swap_entry, 2);
 			return;
 		}
-		zswap_shp = shp = zmap_list;
+		zswap_shp = list_entry(zshmid_kernel.zero_list.next, 
+					struct shmid_kernel, zero_list);
 		zswap_idx = 0;
 		loop = 1;
 	}
+	shp = zswap_shp;
 
 check_table:
 	idx = zswap_idx++;
 	if (idx >= shp->shm_npages) {
-		zswap_shp = (struct shmid_kernel *)(zswap_shp->attaches);
+		zswap_shp = list_entry(zswap_shp->zero_list.next, 
+					struct shmid_kernel, zero_list);
 		zswap_idx = 0;
 		goto next_id;
 	}
@@ -1310,6 +1717,7 @@
 		case RETRY: goto check_table;
 		case FAILED: goto failed;
 	}
+	shm_unlock(zero_id);
 	spin_unlock(&zmap_list_lock);
 
 	shm_swap_postop(page_map);
@@ -1317,3 +1725,4 @@
 		goto next;
 	return;
 }
+
