patch-2.4.1 linux/mm/vmscan.c


diff -u --recursive --new-file v2.4.0/linux/mm/vmscan.c linux/mm/vmscan.c
@@ -35,45 +35,21 @@
  * using a process that no longer actually exists (it might
  * have died while we slept).
  */
-static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
+static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
 {
 	pte_t pte;
 	swp_entry_t entry;
-	struct page * page;
-	int onlist;
-
-	pte = *page_table;
-	if (!pte_present(pte))
-		goto out_failed;
-	page = pte_page(pte);
-	if ((!VALID_PAGE(page)) || PageReserved(page))
-		goto out_failed;
-
-	if (!mm->swap_cnt)
-		return 1;
-
-	mm->swap_cnt--;
 
-	onlist = PageActive(page);
 	/* Don't look at this pte if it's been accessed recently. */
 	if (ptep_test_and_clear_young(page_table)) {
-		age_page_up(page);
-		goto out_failed;
+		page->age += PAGE_AGE_ADV;
+		if (page->age > PAGE_AGE_MAX)
+			page->age = PAGE_AGE_MAX;
+		return;
 	}
-	if (!onlist)
-		/* The page is still mapped, so it can't be freeable... */
-		age_page_down_ageonly(page);
-
-	/*
-	 * If the page is in active use by us, or if the page
-	 * is in active use by others, don't unmap it or
-	 * (worse) start unneeded IO.
-	 */
-	if (page->age > 0)
-		goto out_failed;
 
 	if (TryLockPage(page))
-		goto out_failed;
+		return;
 
 	/* From this point on, the odds are that we're going to
 	 * nuke this pte, so read and clear the pte.  This hook
@@ -87,9 +63,6 @@
 	 * Is the page already in the swap cache? If so, then
 	 * we can just drop our reference to it without doing
 	 * any IO - it's already up-to-date on disk.
-	 *
-	 * Return 0, as we didn't actually free any real
-	 * memory, and we should just continue our scan.
 	 */
 	if (PageSwapCache(page)) {
 		entry.val = page->index;
@@ -99,12 +72,12 @@
 		swap_duplicate(entry);
 		set_pte(page_table, swp_entry_to_pte(entry));
 drop_pte:
-		UnlockPage(page);
 		mm->rss--;
-		deactivate_page(page);
+		if (!page->age)
+			deactivate_page(page);
+		UnlockPage(page);
 		page_cache_release(page);
-out_failed:
-		return 0;
+		return;
 	}
 
 	/*
@@ -153,34 +126,20 @@
 out_unlock_restore:
 	set_pte(page_table, pte);
 	UnlockPage(page);
-	return 0;
+	return;
 }
 
-/*
- * A new implementation of swap_out().  We do not swap complete processes,
- * but only a small number of blocks, before we continue with the next
- * process.  The number of blocks actually swapped is determined on the
- * number of page faults, that this process actually had in the last time,
- * so we won't swap heavily used processes all the time ...
- *
- * Note: the priority argument is a hint on much CPU to waste with the
- *       swap block search, not a hint, of how much blocks to swap with
- *       each process.
- *
- * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
- */
-
-static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
 
 	if (pmd_none(*dir))
-		return 0;
+		return count;
 	if (pmd_bad(*dir)) {
 		pmd_ERROR(*dir);
 		pmd_clear(dir);
-		return 0;
+		return count;
 	}
 	
 	pte = pte_offset(dir, address);
@@ -190,28 +149,33 @@
 		end = pmd_end;
 
 	do {
-		int result;
-		mm->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
-		if (result)
-			return result;
+		if (pte_present(*pte)) {
+			struct page *page = pte_page(*pte);
+
+			if (VALID_PAGE(page) && !PageReserved(page)) {
+				try_to_swap_out(mm, vma, address, pte, page);
+				if (!--count)
+					break;
+			}
+		}
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
-	return 0;
+	mm->swap_address = address + PAGE_SIZE;
+	return count;
 }
 
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
 
 	if (pgd_none(*dir))
-		return 0;
+		return count;
 	if (pgd_bad(*dir)) {
 		pgd_ERROR(*dir);
 		pgd_clear(dir);
-		return 0;
+		return count;
 	}
 
 	pmd = pmd_offset(dir, address);
@@ -221,23 +185,23 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
-		if (result)
-			return result;
+		count = swap_out_pmd(mm, vma, pmd, address, end, count);
+		if (!count)
+			break;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
-	return 0;
+	return count;
 }
 
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
+static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
 {
 	pgd_t *pgdir;
 	unsigned long end;
 
 	/* Don't swap out areas which are locked down */
 	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
-		return 0;
+		return count;
 
 	pgdir = pgd_offset(mm, address);
 
@@ -245,18 +209,17 @@
 	if (address >= end)
 		BUG();
 	do {
-		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
-		if (result)
-			return result;
+		count = swap_out_pgd(mm, vma, pgdir, address, end, count);
+		if (!count)
+			break;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	} while (address && (address < end));
-	return 0;
+	return count;
 }
 
-static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
+static int swap_out_mm(struct mm_struct * mm, int count)
 {
-	int result = 0;
 	unsigned long address;
 	struct vm_area_struct* vma;
 
@@ -276,8 +239,8 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			result = swap_out_vma(mm, vma, address, gfp_mask);
-			if (result)
+			count = swap_out_vma(mm, vma, address, count);
+			if (!count)
 				goto out_unlock;
 			vma = vma->vm_next;
 			if (!vma)
@@ -287,94 +250,63 @@
 	}
 	/* Reset to 0 when we reach the end of address space */
 	mm->swap_address = 0;
-	mm->swap_cnt = 0;
 
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
-	return result;
+	return !count;
 }
 
 /*
- * Select the task with maximal swap_cnt and try to swap out a page.
  * N.B. This function returns only 0 or 1.  Return values != 1 from
  * the lower level routines result in continued processing.
  */
 #define SWAP_SHIFT 5
 #define SWAP_MIN 8
 
+static inline int swap_amount(struct mm_struct *mm)
+{
+	int nr = mm->rss >> SWAP_SHIFT;
+	return nr < SWAP_MIN ? SWAP_MIN : nr;
+}
+
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	int counter;
-	int __ret = 0;
+	int retval = 0;
+	struct mm_struct *mm = current->mm;
 
-	/* 
-	 * We make one or two passes through the task list, indexed by 
-	 * assign = {0, 1}:
-	 *   Pass 1: select the swappable task with maximal RSS that has
-	 *         not yet been swapped out. 
-	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
-	 *
-	 * With this approach, there's no need to remember the last task
-	 * swapped out.  If the swap-out fails, we clear swap_cnt so the 
-	 * task won't be selected again until all others have been tried.
-	 *
-	 * Think of swap_cnt as a "shadow rss" - it tells us which process
-	 * we want to page out (always try largest first).
-	 */
-	counter = (nr_threads << SWAP_SHIFT) >> priority;
-	if (counter < 1)
-		counter = 1;
+	/* Always start by trying to penalize the process that is allocating memory */
+	if (mm)
+		retval = swap_out_mm(mm, swap_amount(mm));
 
-	for (; counter >= 0; counter--) {
+	/* Then, look at the other mm's */
+	counter = mmlist_nr >> priority;
+	do {
 		struct list_head *p;
-		unsigned long max_cnt = 0;
-		struct mm_struct *best = NULL;
-		int assign = 0;
-		int found_task = 0;
-	select:
+
 		spin_lock(&mmlist_lock);
 		p = init_mm.mmlist.next;
-		for (; p != &init_mm.mmlist; p = p->next) {
-			struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
-	 		if (mm->rss <= 0)
-				continue;
-			found_task++;
-			/* Refresh swap_cnt? */
-			if (assign == 1) {
-				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
-				if (mm->swap_cnt < SWAP_MIN)
-					mm->swap_cnt = SWAP_MIN;
-			}
-			if (mm->swap_cnt > max_cnt) {
-				max_cnt = mm->swap_cnt;
-				best = mm;
-			}
-		}
+		if (p == &init_mm.mmlist)
+			goto empty;
+
+		/* Move it to the back of the queue.. */
+		list_del(p);
+		list_add_tail(p, &init_mm.mmlist);
+		mm = list_entry(p, struct mm_struct, mmlist);
 
-		/* Make sure it doesn't disappear */
-		if (best)
-			atomic_inc(&best->mm_users);
+		/* Make sure the mm doesn't disappear when we drop the lock.. */
+		atomic_inc(&mm->mm_users);
 		spin_unlock(&mmlist_lock);
 
-		/*
-		 * We have dropped the tasklist_lock, but we
-		 * know that "mm" still exists: we are running
-		 * with the big kernel lock, and exit_mm()
-		 * cannot race with us.
-		 */
-		if (!best) {
-			if (!assign && found_task > 0) {
-				assign = 1;
-				goto select;
-			}
-			break;
-		} else {
-			__ret = swap_out_mm(best, gfp_mask);
-			mmput(best);
-			break;
-		}
-	}
-	return __ret;
+		/* Walk about 6% of the address space each time */
+		retval |= swap_out_mm(mm, swap_amount(mm));
+		mmput(mm);
+	} while (--counter >= 0);
+	return retval;
+
+empty:
+	spin_unlock(&mmlist_lock);
+	return 0;
 }
 
 
@@ -540,7 +472,6 @@
 		 */
 		if (PageDirty(page)) {
 			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
-			int result;
 
 			if (!writepage)
 				goto page_active;
@@ -558,16 +489,12 @@
 			page_cache_get(page);
 			spin_unlock(&pagemap_lru_lock);
 
-			result = writepage(page);
+			writepage(page);
 			page_cache_release(page);
 
 			/* And re-start the thing.. */
 			spin_lock(&pagemap_lru_lock);
-			if (result != 1)
-				continue;
-			/* writepage refused to do anything */
-			set_page_dirty(page);
-			goto page_active;
+			continue;
 		}
 
 		/*
@@ -808,6 +735,9 @@
 int inactive_shortage(void)
 {
 	int shortage = 0;
+	pg_data_t *pgdat = pgdat_list;
+
+	/* Is the inactive dirty list too small? */
 
 	shortage += freepages.high;
 	shortage += inactive_target;
@@ -818,7 +748,27 @@
 	if (shortage > 0)
 		return shortage;
 
-	return 0;
+	/* If not, do we have enough per-zone pages on the inactive list? */
+
+	shortage = 0;
+
+	do {
+		int i;
+		for(i = 0; i < MAX_NR_ZONES; i++) {
+			int zone_shortage;
+			zone_t *zone = pgdat->node_zones+ i;
+
+			zone_shortage = zone->pages_high;
+			zone_shortage -= zone->inactive_dirty_pages;
+			zone_shortage -= zone->inactive_clean_pages;
+			zone_shortage -= zone->free_pages;
+			if (zone_shortage > 0)
+				shortage += zone_shortage;
+		}
+		pgdat = pgdat->node_next;
+	} while (pgdat);
+
+	return shortage;
 }
 
 /*
@@ -833,72 +783,35 @@
  * really care about latency. In that case we don't try
  * to free too many pages.
  */
+#define DEF_PRIORITY (6)
 static int refill_inactive(unsigned int gfp_mask, int user)
 {
-	int priority, count, start_count, made_progress;
+	int count, start_count, maxtry;
 
 	count = inactive_shortage() + free_shortage();
 	if (user)
 		count = (1 << page_cluster);
 	start_count = count;
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	priority = 6;
+	maxtry = 6;
 	do {
-		made_progress = 0;
-
 		if (current->need_resched) {
 			__set_current_state(TASK_RUNNING);
 			schedule();
 		}
 
-		while (refill_inactive_scan(priority, 1)) {
-			made_progress = 1;
+		while (refill_inactive_scan(DEF_PRIORITY, 1)) {
 			if (--count <= 0)
 				goto done;
 		}
 
-		/*
-		 * don't be too light against the d/i cache since
-	   	 * refill_inactive() almost never fail when there's
-	   	 * really plenty of memory free. 
-		 */
-		shrink_dcache_memory(priority, gfp_mask);
-		shrink_icache_memory(priority, gfp_mask);
+		/* If refill_inactive_scan failed, try to page stuff out.. */
+		swap_out(DEF_PRIORITY, gfp_mask);
 
-		/*
-		 * Then, try to page stuff out..
-		 */
-		while (swap_out(priority, gfp_mask)) {
-			made_progress = 1;
-			if (--count <= 0)
-				goto done;
-		}
-
-		/*
-		 * If we either have enough free memory, or if
-		 * page_launder() will be able to make enough
-		 * free memory, then stop.
-		 */
-		if (!inactive_shortage() || !free_shortage())
-			goto done;
-
-		/*
-		 * Only switch to a lower "priority" if we
-		 * didn't make any useful progress in the
-		 * last loop.
-		 */
-		if (!made_progress)
-			priority--;
-	} while (priority >= 0);
-
-	/* Always end on a refill_inactive.., may sleep... */
-	while (refill_inactive_scan(0, 1)) {
-		if (--count <= 0)
-			goto done;
-	}
+		if (--maxtry <= 0)
+				return 0;
+		
+	} while (inactive_shortage());
 
 done:
 	return (count < start_count);
@@ -922,20 +835,29 @@
 
 	/*
 	 * If needed, we move pages from the active list
-	 * to the inactive list. We also "eat" pages from
-	 * the inode and dentry cache whenever we do this.
+	 * to the inactive list.
 	 */
-	if (free_shortage() || inactive_shortage()) {
-		shrink_dcache_memory(6, gfp_mask);
-		shrink_icache_memory(6, gfp_mask);
+	if (inactive_shortage())
 		ret += refill_inactive(gfp_mask, user);
+
+	/* 	
+	 * Delete pages from the inode and dentry caches and 
+	 * reclaim unused slab cache if memory is low.
+	 */
+	if (free_shortage()) {
+		shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+		shrink_icache_memory(DEF_PRIORITY, gfp_mask);
 	} else {
 		/*
-		 * Reclaim unused slab cache memory.
+		 * Illogical, but true. At least for now.
+		 *
+		 * If we're _not_ under shortage any more, we
+		 * reap the caches. Why? Because a noticeable
+		 * part of the caches are the buffer-heads, 
+		 * which we'll want to keep if under shortage.
 		 */
 		kmem_cache_reap(gfp_mask);
-		ret = 1;
-	}
+	} 
 
 	return ret;
 }
@@ -988,13 +910,8 @@
 		static int recalc = 0;
 
 		/* If needed, try to free some memory. */
-		if (inactive_shortage() || free_shortage()) {
-			int wait = 0;
-			/* Do we need to do some synchronous flushing? */
-			if (waitqueue_active(&kswapd_done))
-				wait = 1;
-			do_try_to_free_pages(GFP_KSWAPD, wait);
-		}
+		if (inactive_shortage() || free_shortage()) 
+			do_try_to_free_pages(GFP_KSWAPD, 0);
 
 		/*
 		 * Do some (very minimal) background scanning. This
@@ -1002,7 +919,7 @@
 		 * every minute. This clears old referenced bits
 		 * and moves unused pages to the inactive list.
 		 */
-		refill_inactive_scan(6, 0);
+		refill_inactive_scan(DEF_PRIORITY, 0);
 
 		/* Once a second, recalculate some VM stats. */
 		if (time_after(jiffies, recalc + HZ)) {
@@ -1010,11 +927,6 @@
 			recalculate_vm_stats();
 		}
 
-		/*
-		 * Wake up everybody waiting for free memory
-		 * and unplug the disk queue.
-		 */
-		wake_up_all(&kswapd_done);
 		run_task_queue(&tq_disk);
 
 		/* 
@@ -1045,33 +957,10 @@
 	}
 }
 
-void wakeup_kswapd(int block)
+void wakeup_kswapd(void)
 {
-	DECLARE_WAITQUEUE(wait, current);
-
-	if (current == kswapd_task)
-		return;
-
-	if (!block) {
-		if (waitqueue_active(&kswapd_wait))
-			wake_up(&kswapd_wait);
-		return;
-	}
-
-	/*
-	 * Kswapd could wake us up before we get a chance
-	 * to sleep, so we have to be very careful here to
-	 * prevent SMP races...
-	 */
-	__set_current_state(TASK_UNINTERRUPTIBLE);
-	add_wait_queue(&kswapd_done, &wait);
-
-	if (waitqueue_active(&kswapd_wait))
-		wake_up(&kswapd_wait);
-	schedule();
-
-	remove_wait_queue(&kswapd_done, &wait);
-	__set_current_state(TASK_RUNNING);
+	if (current != kswapd_task)
+		wake_up_process(kswapd_task);
 }
 
 /*
@@ -1096,7 +985,7 @@
 /*
  * Kreclaimd will move pages from the inactive_clean list to the
  * free list, in order to keep atomic allocations possible under
- * all circumstances. Even when kswapd is blocked on IO.
+ * all circumstances.
  */
 int kreclaimd(void *unused)
 {

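Editor's note on the control-flow change above: the old swap_out() picked the task with the largest swap_cnt and returned a per-page success code; the patch replaces that with a per-call page budget ("count") derived from the mm's RSS via swap_amount(), threaded down through swap_out_vma/swap_out_pgd/swap_out_pmd, while swap_out() itself round-robins over init_mm.mmlist. The following is a minimal userspace sketch of that budget scheme, written only for illustration and under stated assumptions: it is not kernel code, and toy_mm, toy_swap_amount, toy_swap_out_mm and the two-entry mm array are invented names standing in for the real mm_struct machinery.

	/*
	 * Toy model (userspace) of the scan-budget idea in the patch:
	 * each call gets a "count" of pages it may examine, derived from
	 * the mm's RSS, and the walk stops as soon as the budget is gone.
	 */
	#include <stdio.h>

	#define SWAP_SHIFT 5	/* same constants as the patch */
	#define SWAP_MIN   8

	struct toy_mm {
		int rss;		/* resident pages */
		int swap_address;	/* where the last scan stopped */
	};

	/* Mirrors swap_amount(): scan about rss/32 pages, but at least SWAP_MIN. */
	static int toy_swap_amount(const struct toy_mm *mm)
	{
		int nr = mm->rss >> SWAP_SHIFT;
		return nr < SWAP_MIN ? SWAP_MIN : nr;
	}

	/*
	 * Mirrors the new swap_out_mm() contract: take a budget, spend it
	 * page by page, and report "budget used up" as progress (1).
	 */
	static int toy_swap_out_mm(struct toy_mm *mm, int count)
	{
		while (mm->swap_address < mm->rss && count > 0) {
			/* pretend we examined (and possibly unmapped) one page */
			mm->swap_address++;
			count--;
		}
		if (mm->swap_address >= mm->rss)
			mm->swap_address = 0;	/* wrapped: restart next time */
		return !count;
	}

	int main(void)
	{
		struct toy_mm mms[2] = { { .rss = 100 }, { .rss = 1000 } };

		/* Round-robin over the mm's, as the new swap_out() does over mmlist. */
		for (int pass = 0; pass < 3; pass++)
			for (int i = 0; i < 2; i++) {
				int budget = toy_swap_amount(&mms[i]);
				int progress = toy_swap_out_mm(&mms[i], budget);
				printf("pass %d mm %d: budget %d, progress %d, resume at %d\n",
				       pass, i, budget, progress, mms[i].swap_address);
			}
		return 0;
	}

The point of the sketch is the design choice visible in the diff: a larger process gets a proportionally larger scan budget per visit, but no single visit walks the whole address space, and mm->swap_address lets the next visit resume where the last one stopped.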