patch-2.4.23 linux-2.4.23/mm/page_alloc.c

diff -urN linux-2.4.22/mm/page_alloc.c linux-2.4.23/mm/page_alloc.c
@@ -42,6 +42,9 @@
 static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+
+int vm_gfp_debug = 0;
 
 /*
  * Temporary debugging check.
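The two globals added here drive the rest of the patch: lower_zone_reserve_ratio controls how strongly each lower zone is shielded from allocations whose classzone lies above it, and vm_gfp_debug, when nonzero, makes failed allocations dump a stack trace. A minimal sketch of how the ratio is meant to be read, assuming the watermarks[] and realsize fields this patch adds to zone_t (the helper itself is hypothetical, for illustration only):

static unsigned long lower_zone_reserve(pg_data_t *pgdat, int idx, int j)
{
	unsigned long pages = 0;
	int i;

	/* pages of all zones above idx, up to and including classzone j */
	for (i = idx + 1; i <= j; i++)
		pages += pgdat->node_zones[i].realsize;

	/* e.g. ratio[0] = 256: ZONE_DMA keeps back 1/256th of what sits above it */
	return pages / lower_zone_reserve_ratio[idx];
}

This matches the accumulation performed in the zone-setup hunk further down.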
@@ -106,7 +109,8 @@
 		BUG();
 	if (PageActive(page))
 		BUG();
-	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
+	ClearPageReferenced(page);
+	ClearPageDirty(page);
 
 	if (current->flags & PF_FREE_PAGES)
 		goto local_freelist;
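The open-coded flag mask is replaced by the page-flag macros, which in 2.4 expand to atomic bit operations, so the non-atomic read-modify-write of page->flags goes away. From include/linux/mm.h, quoted for reference:

#define ClearPageReferenced(page)	clear_bit(PG_referenced, &(page)->flags)
#define ClearPageDirty(page)		clear_bit(PG_dirty, &(page)->flags)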
@@ -253,10 +257,8 @@
 static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
 {
 	struct page * page = NULL;
-	int __freed = 0;
+	int __freed;
 
-	if (!(gfp_mask & __GFP_WAIT))
-		goto out;
 	if (in_interrupt())
 		BUG();
 
@@ -316,33 +318,36 @@
 		}
 		current->nr_local_pages = 0;
 	}
- out:
+
 	*freed = __freed;
 	return page;
 }
 
+static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
+{
+	long free = zone->free_pages - (1UL << order);
+	return free >= 0 ? free : 0;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator:
  */
 struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
 {
-	unsigned long min;
 	zone_t **zone, * classzone;
 	struct page * page;
-	int freed;
+	int freed, class_idx;
 
 	zone = zonelist->zones;
 	classzone = *zone;
-	if (classzone == NULL)
-		return NULL;
-	min = 1UL << order;
+	class_idx = zone_idx(classzone);
+
 	for (;;) {
 		zone_t *z = *(zone++);
 		if (!z)
 			break;
 
-		min += z->pages_low;
-		if (z->free_pages > min) {
+		if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
 			page = rmqueue(z, order);
 			if (page)
 				return page;
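With zone_free_pages() the first pass asks whether the zone would still sit above its per-classzone low watermark after the allocation, instead of accumulating min += z->pages_low across the zonelist as 2.4.22 did. The pattern, pulled out into a hypothetical helper for illustration (names as in this patch):

static struct page * try_zones_low(zone_t **zone, int class_idx, unsigned int order)
{
	zone_t *z;

	while ((z = *zone++) != NULL) {
		/* what remains after the allocation must still exceed "low" */
		if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
			struct page *page = rmqueue(z, order);
			if (page)
				return page;
		}
	}
	return NULL;
}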
@@ -355,18 +360,16 @@
 		wake_up_interruptible(&kswapd_wait);
 
 	zone = zonelist->zones;
-	min = 1UL << order;
 	for (;;) {
-		unsigned long local_min;
+		unsigned long min;
 		zone_t *z = *(zone++);
 		if (!z)
 			break;
 
-		local_min = z->pages_min;
+		min = z->watermarks[class_idx].min;
 		if (!(gfp_mask & __GFP_WAIT))
-			local_min >>= 2;
-		min += local_min;
-		if (z->free_pages > min) {
+			min >>= 2;
+		if (zone_free_pages(z, order) > min) {
 			page = rmqueue(z, order);
 			if (page)
 				return page;
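The second pass repeats the scan against the lower min watermark, and callers without __GFP_WAIT may dip a further three quarters below it, since they cannot sleep to reclaim memory themselves. For example, with watermarks[class_idx].min = 128 a sleeping allocator must leave more than 128 pages free, while a GFP_ATOMIC one may press on down to 128 >> 2 = 32.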
@@ -375,8 +378,7 @@
 
 	/* here we're in the low on memory slow path */
 
-rebalance:
-	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
+	if (current->flags & PF_MEMALLOC && !in_interrupt()) {
 		zone = zonelist->zones;
 		for (;;) {
 			zone_t *z = *(zone++);
@@ -392,34 +394,51 @@
 
 	/* Atomic allocations - we can't balance anything */
 	if (!(gfp_mask & __GFP_WAIT))
-		return NULL;
+		goto out;
 
+ rebalance:
 	page = balance_classzone(classzone, gfp_mask, order, &freed);
 	if (page)
 		return page;
 
 	zone = zonelist->zones;
-	min = 1UL << order;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
+	if (likely(freed)) {
+		for (;;) {
+			zone_t *z = *(zone++);
+			if (!z)
+				break;
 
-		min += z->pages_min;
-		if (z->free_pages > min) {
-			page = rmqueue(z, order);
-			if (page)
-				return page;
+			if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
+				page = rmqueue(z, order);
+				if (page)
+					return page;
+			}
 		}
-	}
+		goto rebalance;
+	} else {
+		/*
+		 * Check whether some other task has been killed in the
+		 * meantime; in that case the allocation can succeed.
+		 */
+		for (;;) {
+			zone_t *z = *(zone++);
+			if (!z)
+				break;
 
-	/* Don't let big-order allocations loop */
-	if (order > 3)
-		return NULL;
+			if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
+				page = rmqueue(z, order);
+				if (page)
+					return page;
+			}
+		}
+	}
 
-	/* Yield for kswapd, and try again */
-	yield();
-	goto rebalance;
+ out:
+	printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
+	       order, gfp_mask, !!(current->flags & PF_MEMALLOC));
+	if (unlikely(vm_gfp_debug))
+		dump_stack();
+	return NULL;
 }
 
 /*
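The slow path is now structured around reclaim progress. The __GFP_WAIT bail-out that 2.4.22 performed inside balance_classzone() happens before the call, and the rebalance label has moved below it, so only callers that may sleep loop. In outline:

	/*
	 * Slow path after this patch (summary, not literal code):
	 *
	 *	if (!(gfp_mask & __GFP_WAIT))
	 *		goto out;			atomic: fail now
	 * rebalance:
	 *	page = balance_classzone(...);		synchronous reclaim
	 *	if (page) return page;
	 *	if (freed)  retry zones at watermarks[].min, goto rebalance;
	 *	else	    retry zones at watermarks[].high, fall through;
	 * out:
	 *	printk(...); if (vm_gfp_debug) dump_stack(); return NULL;
	 */

The high-watermark pass can only succeed if some other task has freed memory in the meantime, for instance one picked off by the OOM killer; on final failure the new printk fires, and vm_gfp_debug (its sysctl wiring lives outside this file) additionally dumps the call stack.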
@@ -481,17 +500,22 @@
 {
 	pg_data_t *pgdat;
 	unsigned int sum = 0;
+	zonelist_t *zonelist;
+	zone_t **zonep, *zone;
 
 	for_each_pgdat(pgdat) {
-		zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
-		zone_t **zonep = zonelist->zones;
-		zone_t *zone;
-
-		for (zone = *zonep++; zone; zone = *zonep++) {
-			unsigned long size = zone->size;
-			unsigned long high = zone->pages_high;
-			if (size > high)
-				sum += size - high;
+		int class_idx;
+		zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
+		zonep = zonelist->zones;
+		zone = *zonep;
+		class_idx = zone_idx(zone);
+
+		sum += zone->nr_cache_pages;
+		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+			int free = zone->free_pages - zone->watermarks[class_idx].high;
+			if (free <= 0)
+				continue;
+			sum += free;
 		}
 	}
 
@@ -532,13 +556,9 @@
 		zone_t *zone;
 		for (zone = tmpdat->node_zones;
 			       	zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
-			printk("Zone:%s freepages:%6lukB min:%6lukB low:%6lukB " 
-				       "high:%6lukB\n", 
+			printk("Zone:%s freepages:%6lukB\n", 
 					zone->name,
-					K(zone->free_pages),
-					K(zone->pages_min),
-					K(zone->pages_low),
-					K(zone->pages_high));
+					K(zone->free_pages));
 			
 		tmpdat = tmpdat->node_next;
 	}
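Since watermarks are now a per-classzone matrix, a zone no longer has a single min/low/high to report, so show_free_areas() prints only the free-page count.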
@@ -729,6 +749,7 @@
 		zone_t *zone = pgdat->node_zones + j;
 		unsigned long mask;
 		unsigned long size, realsize;
+		int idx;
 
 		zone_table[nid * MAX_NR_ZONES + j] = zone;
 		realsize = size = zones_size[j];
@@ -737,11 +758,15 @@
 
 		printk("zone(%lu): %lu pages.\n", j, size);
 		zone->size = size;
+		zone->realsize = realsize;
 		zone->name = zone_names[j];
 		zone->lock = SPIN_LOCK_UNLOCKED;
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
 		zone->need_balance = 0;
+		zone->nr_active_pages = zone->nr_inactive_pages = 0;
+
 		if (!size)
 			continue;
 
@@ -766,9 +791,29 @@
 			mask = zone_balance_min[j];
 		else if (mask > zone_balance_max[j])
 			mask = zone_balance_max[j];
-		zone->pages_min = mask;
-		zone->pages_low = mask*2;
-		zone->pages_high = mask*3;
+		zone->watermarks[j].min = mask;
+		zone->watermarks[j].low = mask*2;
+		zone->watermarks[j].high = mask*3;
+		/* now set the watermarks of the lower zones within classzone "j" */
+		for (idx = j-1; idx >= 0; idx--) {
+			zone_t * lower_zone = pgdat->node_zones + idx;
+			unsigned long lower_zone_reserve;
+			if (!lower_zone->size)
+				continue;
+
+			mask = lower_zone->watermarks[idx].min;
+			lower_zone->watermarks[j].min = mask;
+			lower_zone->watermarks[j].low = mask*2;
+			lower_zone->watermarks[j].high = mask*3;
+
+			/* the trickier part: stack the inter-zone reserve on top */
+			lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
+			lower_zone->watermarks[j].min += lower_zone_reserve;
+			lower_zone->watermarks[j].low += lower_zone_reserve;
+			lower_zone->watermarks[j].high += lower_zone_reserve;
+
+			realsize += lower_zone->realsize;
+		}
 
 		zone->zone_mem_map = mem_map + offset;
 		zone->zone_start_mapnr = offset;
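The setup walks downward from every classzone j and stacks a reserve on top of each lower zone's own watermarks; because realsize keeps accumulating, zone idx reserves against all of zones idx+1..j combined. A worked example with the default ratios { 256, 32 } and illustrative zone sizes (4 KB pages):

	/*
	 * Say ZONE_DMA = 4096 pages, ZONE_NORMAL = 225280 pages (880 MB),
	 * ZONE_HIGHMEM = 262144 pages (1 GB).  For classzone HIGHMEM (j = 2):
	 *
	 *   ZONE_NORMAL (idx = 1): reserve = 262144 / 32 = 8192 pages (32 MB)
	 *   ZONE_DMA    (idx = 0): reserve = (262144 + 225280) / 256
	 *			   = 1904 pages (~7.4 MB)
	 *
	 * A GFP_HIGHUSER allocation falling back into ZONE_NORMAL therefore
	 * gives up 8192 pages sooner than a GFP_KERNEL allocation would.
	 */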
@@ -852,3 +897,16 @@
 }
 
 __setup("memfrac=", setup_mem_frac);
+
+static int __init setup_lower_zone_reserve(char *str)
+{
+	int j = 0;
+
+	/* bound the loop: the array has only MAX_NR_ZONES-1 slots */
+	while (j < MAX_NR_ZONES-1 &&
+	       get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
+	printk("setup_lower_zone_reserve: ");
+	for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d  ", lower_zone_reserve_ratio[j]);
+	printk("\n");
+	return 1;
+}
+
+__setup("lower_zone_reserve=", setup_lower_zone_reserve);
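The defaults can be overridden at boot, e.g. lower_zone_reserve=1024,128 quarters both reserves (a larger ratio means a smaller reserve); with the bounds check above, surplus values on the command line are ignored.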
