patch-2.3.29 linux/mm/page_alloc.c
- Lines: 699
- Date: Tue Nov 23 10:20:58 1999
- Orig file: v2.3.28/linux/mm/page_alloc.c
- Orig date: Thu Nov 11 20:11:55 1999
diff -u --recursive --new-file v2.3.28/linux/mm/page_alloc.c linux/mm/page_alloc.c
@@ -9,74 +9,25 @@
#include <linux/config.h>
#include <linux/mm.h>
-#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
-#include <linux/init.h>
#include <linux/pagemap.h>
-#include <linux/highmem.h>
#include <linux/bootmem.h>
-#include <asm/dma.h>
-#include <asm/uaccess.h> /* for copy_to/from_user */
-#include <asm/pgtable.h>
-
int nr_swap_pages = 0;
int nr_lru_pages;
LIST_HEAD(lru_cache);
-/*
- * Free area management
- *
- * The free_area_list arrays point to the queue heads of the free areas
- * of different sizes
- */
-
-#if CONFIG_AP1000
-/* the AP+ needs to allocate 8MB contiguous, aligned chunks of ram
- for the ring buffers */
-#define MAX_ORDER 12
-#else
-#define MAX_ORDER 10
-#endif
-
-typedef struct free_area_struct {
- struct list_head free_list;
- unsigned int * map;
-} free_area_t;
-
-#define ZONE_DMA 0
-#define ZONE_NORMAL 1
-
-#ifdef CONFIG_HIGHMEM
-# define ZONE_HIGHMEM 2
-# define NR_ZONES 3
-#else
-# define NR_ZONES 2
-#endif
-
-typedef struct zone_struct {
- spinlock_t lock;
- unsigned long offset;
- unsigned long size;
- free_area_t free_area[MAX_ORDER];
-
- unsigned long free_pages;
- unsigned long pages_low, pages_high;
- int low_on_memory;
- char * name;
-} zone_t;
-
-static zone_t zones[NR_ZONES] =
+static zone_t zones [MAX_NR_ZONES] =
{
{ name: "DMA" },
{ name: "Normal" },
-#ifdef CONFIG_HIGHMEM
{ name: "HighMem" }
-#endif
};
+zonelist_t zonelists [NR_GFPINDEX];
+
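The per-file zone definitions removed above (MAX_ORDER, free_area_t, zone_t, the ZONE_* indices) move into a shared header in this release, and the new zonelists[] array introduces a per-GFP-index fallback list. The header itself is not part of this file; judging only from how __alloc_pages() and build_zonelists() use the type later in the patch, a minimal userspace sketch of the idea might look like the following (the struct layout, array sizes and sample zones are assumptions for illustration, not the kernel's definitions):

/* Sketch only: mimics how a zonelist is consumed, not 2.3.29 kernel code. */
#include <stdio.h>

typedef struct zone_struct {
	const char *name;
	unsigned long free_pages;
} zone_t;

typedef struct zonelist_struct {
	zone_t *zones[4];	/* NULL-delimited, in fallback order */
	int gfp_mask;
} zonelist_t;

static zone_t highmem = { "HighMem", 0 };	/* nothing free: gets skipped */
static zone_t normal  = { "Normal",  1000 };
static zone_t dma     = { "DMA",     100 };

int main(void)
{
	zonelist_t zl = { { &highmem, &normal, &dma, NULL }, 0 };
	zone_t **zonep;
	zone_t *z;

	/* Fallback walk in the spirit of __alloc_pages(): try each zone in
	 * order, skipping zones that have no free pages at all. */
	for (zonep = zl.zones; (z = *zonep) != NULL; zonep++) {
		if (!z->free_pages)
			continue;
		printf("allocating from the %s zone\n", z->name);
		return 0;
	}
	printf("no page\n");
	return 1;
}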
/*
* Free_page() adds the page to the free lists. This is optimized for
* fast normal cases (no error jumps taken normally).
@@ -100,7 +51,7 @@
/*
* Temporary debugging check.
*/
-#define BAD_RANGE(zone,x) ((((x)-mem_map) < zone->offset) || (((x)-mem_map) >= zone->offset+zone->size))
+#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
/*
* Buddy system. Hairy. You really aren't expected to understand this
@@ -108,43 +59,50 @@
* Hint: -mask = 1+~mask
*/
-static inline void free_pages_ok (struct page *page, unsigned long map_nr, unsigned long order)
+void __free_pages_ok (struct page *page, unsigned long order)
{
- struct free_area_struct *area;
- unsigned long index, page_idx, mask, offset;
- unsigned long flags;
- struct page *buddy;
+ unsigned long index, page_idx, mask, flags;
+ free_area_t *area;
+ struct page *base;
zone_t *zone;
- int i;
/*
- * Which zone is this page belonging to.
- *
- * (NR_ZONES is low, and we do not want (yet) to introduce
- * put page->zone, it increases the size of mem_map[]
- * unnecesserily. This small loop is basically equivalent
- * to the previous #ifdef jungle, speed-wise.)
+ * Subtle. We do not want to test this in the inlined part of
+ * __free_page() - it's a rare condition and just increases
+ * cache footprint unnecessarily. So we do an 'incorrect'
+ * decrement on page->count for reserved pages, but this part
+ * makes it safe.
*/
- i = NR_ZONES-1;
- zone = zones + i;
- for ( ; i >= 0; i--, zone--)
- if (map_nr >= zone->offset)
- break;
+ if (PageReserved(page))
+ return;
+
+ if (page-mem_map >= max_mapnr)
+ BUG();
+ if (PageSwapCache(page))
+ BUG();
+ if (PageLocked(page))
+ BUG();
+
+ zone = page->zone;
mask = (~0UL) << order;
- offset = zone->offset;
- area = zone->free_area;
- area += order;
- page_idx = map_nr - zone->offset;
- page_idx &= mask;
+ base = mem_map + zone->offset;
+ page_idx = page - base;
+ if (page_idx & ~mask)
+ BUG();
index = page_idx >> (1 + order);
- mask = (~0UL) << order;
+
+ area = zone->free_area + order;
spin_lock_irqsave(&zone->lock, flags);
zone->free_pages -= mask;
while (mask + (1 << (MAX_ORDER-1))) {
+ struct page *buddy1, *buddy2;
+
+ if (area >= zone->free_area + MAX_ORDER)
+ BUG();
if (!test_and_change_bit(index, area->map))
/*
* the buddy page is still allocated.
@@ -153,87 +111,52 @@
/*
* Move the buddy up one level.
*/
- buddy = mem_map + offset + (page_idx ^ -mask);
- page = mem_map + offset + page_idx;
- if (BAD_RANGE(zone,buddy))
+ buddy1 = base + (page_idx ^ -mask);
+ buddy2 = base + page_idx;
+ if (BAD_RANGE(zone,buddy1))
BUG();
- if (BAD_RANGE(zone,page))
+ if (BAD_RANGE(zone,buddy2))
BUG();
- memlist_del(&buddy->list);
+ memlist_del(&buddy1->list);
mask <<= 1;
area++;
index >>= 1;
page_idx &= mask;
}
- memlist_add_head(&mem_map[offset + page_idx].list, &area->free_list);
+ memlist_add_head(&(base + page_idx)->list, &area->free_list);
spin_unlock_irqrestore(&zone->lock, flags);
}
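The merge loop above relies on two pieces of index arithmetic that are easy to miss: with mask = (~0UL) << order, the hint "-mask = 1+~mask" means -mask is simply 1UL << order, so page_idx ^ -mask toggles exactly the order bit and names the buddy block, while shifting an index right by (1 + order), as the MARK_USED macro just below does, collapses both members of a buddy pair onto the same bit of area->map. A throwaway userspace illustration with made-up block indices:

/* Illustration of the buddy arithmetic in __free_pages_ok(); not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned long order = 2;			/* 4-page blocks */
	unsigned long mask = (~0UL) << order;
	unsigned long page_idx = 8;			/* index of a free order-2 block */
	unsigned long buddy = page_idx ^ -mask;		/* -mask == 4, so buddy == 12 */

	printf("buddy of %lu at order %lu is %lu\n", page_idx, order, buddy);
	printf("pair bit of %lu: %lu\n", page_idx, page_idx >> (1 + order));
	printf("pair bit of %lu: %lu\n", buddy, buddy >> (1 + order));	/* same bit */
	printf("merged block starts at %lu\n", page_idx & (mask << 1));
	return 0;
}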
-/*
- * Some ugly macros to speed up __get_free_pages()..
- */
#define MARK_USED(index, order, area) \
change_bit((index) >> (1+(order)), (area)->map)
-#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
-
-int __free_page (struct page *page)
-{
- if (!PageReserved(page) && put_page_testzero(page)) {
- if (PageSwapCache(page))
- PAGE_BUG(page);
- if (PageLocked(page))
- PAGE_BUG(page);
-
- free_pages_ok(page, page-mem_map, 0);
- return 1;
- }
- return 0;
-}
-int free_pages (unsigned long addr, unsigned long order)
-{
- unsigned long map_nr = MAP_NR(addr);
-
- if (map_nr < max_mapnr) {
- mem_map_t * map = mem_map + map_nr;
- if (!PageReserved(map) && put_page_testzero(map)) {
- if (PageSwapCache(map))
- PAGE_BUG(map);
- if (PageLocked(map))
- PAGE_BUG(map);
- free_pages_ok(map, map_nr, order);
- return 1;
- }
- }
- return 0;
-}
-
-static inline unsigned long EXPAND (zone_t *zone, struct page *map, unsigned long index,
- int low, int high, struct free_area_struct * area)
+static inline struct page * expand (zone_t *zone, struct page *page,
+ unsigned long index, int low, int high, free_area_t * area)
{
unsigned long size = 1 << high;
while (high > low) {
- if (BAD_RANGE(zone,map))
+ if (BAD_RANGE(zone,page))
BUG();
area--;
high--;
size >>= 1;
- memlist_add_head(&(map)->list, &(area)->free_list);
+ memlist_add_head(&(page)->list, &(area)->free_list);
MARK_USED(index, high, area);
index += size;
- map += size;
+ page += size;
}
- set_page_count(map, 1);
- return index;
+ if (BAD_RANGE(zone,page))
+ BUG();
+ return page;
}
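expand() above carves a block of the requested order out of a larger free block by repeatedly halving it and handing the lower half back to the next-lower free list; the page finally returned is the last remaining piece at the top of the original block. A userspace walk-through of that splitting, with made-up block positions:

/* Walk-through of the splitting done by expand(); illustration only. */
#include <stdio.h>

int main(void)
{
	unsigned long page_idx = 0;	/* start of a free order-3 block */
	int low = 0, high = 3;		/* want one page (order 0) out of 8 pages */
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		/* This half goes back on the order-'high' free list, as expand() does. */
		printf("back on order-%d list: pages %lu-%lu\n",
		       high, page_idx, page_idx + size - 1);
		page_idx += size;
	}
	printf("allocated page: %lu\n", page_idx);
	return 0;
}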
static inline struct page * rmqueue (zone_t *zone, unsigned long order)
{
- struct free_area_struct * area = zone->free_area + order;
- unsigned long curr_order = order, map_nr;
+ free_area_t * area = zone->free_area + order;
+ unsigned long curr_order = order;
struct list_head *head, *curr;
unsigned long flags;
struct page *page;
@@ -247,15 +170,17 @@
unsigned int index;
page = memlist_entry(curr, struct page, list);
+ if (BAD_RANGE(zone,page))
+ BUG();
memlist_del(curr);
- map_nr = page - mem_map;
- index = map_nr - zone->offset;
+ index = (page - mem_map) - zone->offset;
MARK_USED(index, curr_order, area);
zone->free_pages -= 1 << order;
- map_nr = zone->offset + EXPAND(zone, page, index, order, curr_order, area);
+
+ page = expand(zone, page, index, order, curr_order, area);
spin_unlock_irqrestore(&zone->lock, flags);
- page = mem_map + map_nr;
+ set_page_count(page, 1);
if (BAD_RANGE(zone,page))
BUG();
return page;
@@ -268,11 +193,14 @@
return NULL;
}
-static inline int balance_memory (zone_t *zone, int gfp_mask)
+#define ZONE_BALANCED(zone) \
+ (((zone)->free_pages > (zone)->pages_low) && (!(zone)->low_on_memory))
+
+static inline int zone_balance_memory (zone_t *zone, int gfp_mask)
{
int freed;
- if (zone->free_pages > zone->pages_low) {
+ if (zone->free_pages >= zone->pages_low) {
if (!zone->low_on_memory)
return 1;
/*
@@ -283,8 +211,16 @@
zone->low_on_memory = 0;
return 1;
}
- }
- zone->low_on_memory = 1;
+ } else
+ zone->low_on_memory = 1;
+
+ /*
+ * In the atomic allocation case we only 'kick' the
+ * state machine, but do not try to free pages
+ * ourselves.
+ */
+ if (!(gfp_mask & __GFP_WAIT))
+ return 1;
current->flags |= PF_MEMALLOC;
freed = try_to_free_pages(gfp_mask);
@@ -295,39 +231,96 @@
return 1;
}
-static inline struct page * __get_pages (zone_t *zone, unsigned int gfp_mask,
- unsigned long order)
+/*
+ * We are still balancing memory in a global way:
+ */
+static inline int balance_memory (int gfp_mask)
{
- struct page *page;
+ unsigned long free = nr_free_pages();
+ static int low_on_memory = 0;
+ int freed;
- if (order >= MAX_ORDER)
- goto nopage;
+ if (free >= freepages.low) {
+ if (!low_on_memory)
+ return 1;
+ /*
+ * Simple hysteresis: exit 'low memory mode' if
+ * the upper limit has been reached:
+ */
+ if (free >= freepages.high) {
+ low_on_memory = 0;
+ return 1;
+ }
+ } else
+ low_on_memory = 1;
/*
- * If anyone calls gfp from interrupts nonatomically then it
- * will sooner or later tripped up by a schedule().
+ * In the atomic allocation case we only 'kick' the
+ * state machine, but do not try to free pages
+ * ourselves.
*/
+ if (!(gfp_mask & __GFP_WAIT))
+ return 1;
+
+ current->flags |= PF_MEMALLOC;
+ freed = try_to_free_pages(gfp_mask);
+ current->flags &= ~PF_MEMALLOC;
+
+ if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+ return 0;
+ return 1;
+}
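Both zone_balance_memory() above and the global balance_memory() use the same low/high watermark hysteresis: once free memory dips below the low watermark, the allocator keeps trying to reclaim until it has climbed back above the high watermark, not merely above low, and atomic (!__GFP_WAIT) callers only kick that state machine without reclaiming themselves. A small userspace sketch of the state machine, with made-up watermark values and sample readings:

/* Sketch of the freepages.low/freepages.high hysteresis; not kernel code. */
#include <stdio.h>

static int low_on_memory = 0;

/* Returns 1 when no reclaim is needed for this allocation. */
static int balanced(unsigned long free, unsigned long low, unsigned long high)
{
	if (free >= low) {
		if (!low_on_memory)
			return 1;
		if (free >= high) {	/* hysteresis: leave low-memory mode */
			low_on_memory = 0;
			return 1;
		}
	} else
		low_on_memory = 1;
	return 0;			/* caller would run try_to_free_pages() */
}

int main(void)
{
	unsigned long low = 100, high = 200;
	unsigned long samples[] = { 300, 150, 90, 150, 250, 150 };
	size_t i;

	for (i = 0; i < sizeof(samples)/sizeof(samples[0]); i++)
		printf("free=%lu -> %s\n", samples[i],
		       balanced(samples[i], low, high) ? "ok" : "reclaim");
	return 0;
}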
+
+/*
+ * This is the 'heart' of the zoned buddy allocator:
+ */
+struct page * __alloc_pages (zonelist_t *zonelist, unsigned long order)
+{
+ zone_t **zone, *z;
+ struct page *page;
+ int gfp_mask;
/*
- * If this is a recursive call, we'd better
- * do our best to just allocate things without
- * further thought.
- */
- if (!(current->flags & PF_MEMALLOC))
- if (!balance_memory(zone, gfp_mask))
- goto nopage;
- /*
+ * (If anyone calls gfp from interrupts nonatomically then it
+ * will sooner or later be tripped up by a schedule().)
+ *
* We are falling back to lower-level zones if allocation
- * in a higher zone fails. This assumes a hierarchical
- * dependency between zones, which is true currently. If
- * you need something else then move this loop outside
- * this function, into the zone-specific allocator.
+ * in a higher zone fails.
*/
- do {
- page = rmqueue(zone, order);
- if (page)
- return page;
- } while (zone-- != zones) ;
+ zone = zonelist->zones;
+ gfp_mask = zonelist->gfp_mask;
+ for (;;) {
+ z = *(zone++);
+ if (!z)
+ break;
+ if (!z->size)
+ BUG();
+ /*
+ * If this is a recursive call, we'd better
+ * do our best to just allocate things without
+ * further thought.
+ */
+ if (!(current->flags & PF_MEMALLOC))
+ /*
+ * fastpath
+ */
+ if (!ZONE_BALANCED(z))
+ goto balance;
+ /*
+ * This is an optimization for the 'higher order zone
+ * is empty' case - it can happen even in well-behaved
+ * systems, think the page-cache filling up all RAM.
+ * We skip over empty zones. (this is not exact because
+ * we do not take the spinlock and it's not exact for
+ * the higher order case, but will do it for most things.)
+ */
+ready:
+ if (z->free_pages) {
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
/*
* If we can schedule, do so, and make sure to yield.
@@ -341,37 +334,14 @@
nopage:
return NULL;
-}
-
-static inline zone_t * gfp_mask_to_zone (int gfp_mask)
-{
- zone_t *zone;
-
-#if CONFIG_HIGHMEM
- if (gfp_mask & __GFP_HIGHMEM)
- zone = zones + ZONE_HIGHMEM;
- else
-#endif
- if (gfp_mask & __GFP_DMA)
- zone = zones + ZONE_DMA;
- else
- zone = zones + ZONE_NORMAL;
- return zone;
-}
-unsigned long __get_free_pages (int gfp_mask, unsigned long order)
-{
- struct page *page;
-
- page = __get_pages(gfp_mask_to_zone(gfp_mask), gfp_mask, order);
- if (!page)
- return 0;
- return page_address(page);
-}
-
-struct page * alloc_pages (int gfp_mask, unsigned long order)
-{
- return __get_pages(gfp_mask_to_zone(gfp_mask), gfp_mask, order);
+/*
+ * The main chunk of the balancing code is in this offline branch:
+ */
+balance:
+ if (!balance_memory(gfp_mask))
+ goto nopage;
+ goto ready;
}
/*
@@ -383,7 +353,7 @@
zone_t *zone;
sum = 0;
- for (zone = zones; zone < zones+NR_ZONES; zone++)
+ for (zone = zones; zone < zones + MAX_NR_ZONES; zone++)
sum += zone->free_pages;
return sum;
}
@@ -420,8 +390,9 @@
unsigned type;
printk("Free pages: %6dkB (%6dkB HighMem)\n",
- nr_free_pages()<<(PAGE_SHIFT-10),
- nr_free_highpages()<<(PAGE_SHIFT-10));
+ nr_free_pages() << (PAGE_SHIFT-10),
+ nr_free_highpages() << (PAGE_SHIFT-10));
+
printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
nr_free_pages(),
nr_lru_pages,
@@ -429,25 +400,33 @@
freepages.low,
freepages.high);
- for (type = 0; type < NR_ZONES; type++) {
+ for (type = 0; type < MAX_NR_ZONES; type++) {
+ struct list_head *head, *curr;
zone_t *zone = zones + type;
- unsigned long total = 0;
+ unsigned long nr, total, flags;
printk(" %s: ", zone->name);
- for (order = 0; order < MAX_ORDER; order++) {
- unsigned long i, nr;
- nr = 0;
- for (i = 0; i < zone->size; i += 1<<order) {
- struct page * page;
- page = mem_map + zone->offset + i;
- if (!page_count(page))
+ total = 0;
+ if (zone->size) {
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ head = &(zone->free_area + order)->free_list;
+ curr = head;
+ nr = 0;
+ for (;;) {
+ curr = memlist_next(curr);
+ if (curr == head)
+ break;
nr++;
+ }
+ total += nr * (1 << order);
+ printk("%lu*%lukB ", nr,
+ (PAGE_SIZE>>10) << order);
}
- total += nr * ((PAGE_SIZE>>10) << order);
- printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
+ spin_unlock_irqrestore(&zone->lock, flags);
}
- printk("= %lukB)\n", total);
+ printk("= %lukB)\n", total * (PAGE_SIZE>>10));
}
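With this change show_free_areas() counts free blocks by walking the per-order free lists under the zone lock and keeps total in pages, converting to kilobytes only at the end. As a worked example of the printed arithmetic on a machine with 4 kB pages (the free-list counts below are invented):

/* Worked example of the show_free_areas() arithmetic; counts are made up. */
#include <stdio.h>

int main(void)
{
	unsigned long page_kb = 4;		/* PAGE_SIZE >> 10 on a 4 kB-page box */
	unsigned long nr[] = { 3, 2, 0, 1 };	/* free blocks per order */
	unsigned long total = 0;		/* counted in pages, as in the patch */
	unsigned long order;

	for (order = 0; order < 4; order++) {
		total += nr[order] * (1UL << order);
		printf("%lu*%lukB ", nr[order], page_kb << order);
	}
	/* 3 + 4 + 0 + 8 = 15 pages = 60 kB */
	printf("= %lukB\n", total * page_kb);
	return 0;
}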
#ifdef SWAP_CACHE_INFO
@@ -455,6 +434,55 @@
#endif
}
+/*
+ * Builds allocation fallback zone lists. We are basically ready
+ * to do NUMA-allocations, only this function has to be modified
+ * and the zonelists array be made per-CPU.
+ */
+static inline void build_zonelists (void)
+{
+ int i, j, k;
+
+ for (i = 0; i < NR_GFPINDEX; i++) {
+ zonelist_t *zonelist;
+ zone_t *zone;
+
+ zonelist = zonelists + i;
+ memset(zonelist, 0, sizeof(*zonelist));
+
+ zonelist->gfp_mask = i;
+ j = 0;
+ k = ZONE_NORMAL;
+ if (i & __GFP_HIGHMEM)
+ k = ZONE_HIGHMEM;
+ if (i & __GFP_DMA)
+ k = ZONE_DMA;
+
+ switch (k) {
+ default:
+ BUG();
+ /*
+ * fallthrough:
+ */
+ case ZONE_HIGHMEM:
+ zone = zones + ZONE_HIGHMEM;
+ if (zone->size) {
+#ifndef CONFIG_HIGHMEM
+ BUG();
+#endif
+ zonelist->zones[j++] = zone;
+ }
+ case ZONE_NORMAL:
+ zone = zones + ZONE_NORMAL;
+ if (zone->size)
+ zonelist->zones[j++] = zone;
+ case ZONE_DMA:
+ zonelist->zones[j++] = zones + ZONE_DMA;
+ }
+ zonelist->zones[j++] = NULL;
+ }
+}
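The switch fall-through in build_zonelists() is what encodes the fallback policy: a __GFP_DMA allocation may only use the DMA zone, a normal allocation falls back from Normal to DMA, and a __GFP_HIGHMEM allocation from HighMem through Normal to DMA. A userspace sketch of just that ordering logic, using stand-in flag values rather than the kernel's:

/* Sketch of the zone fallback ordering; flag values are stand-ins. */
#include <stdio.h>

#define GFP_DMA     0x01	/* stand-in for __GFP_DMA */
#define GFP_HIGHMEM 0x02	/* stand-in for __GFP_HIGHMEM */

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

static void print_fallback(int gfp_mask)
{
	int k = ZONE_NORMAL;

	if (gfp_mask & GFP_HIGHMEM)
		k = ZONE_HIGHMEM;
	if (gfp_mask & GFP_DMA)		/* DMA wins if both bits are set */
		k = ZONE_DMA;

	printf("gfp 0x%02x:", gfp_mask);
	switch (k) {			/* the fall-through builds the list */
	case ZONE_HIGHMEM:
		printf(" HighMem");
		/* fall through */
	case ZONE_NORMAL:
		printf(" Normal");
		/* fall through */
	case ZONE_DMA:
		printf(" DMA");
	}
	printf("\n");
}

int main(void)
{
	print_fallback(0);
	print_fallback(GFP_HIGHMEM);
	print_fallback(GFP_DMA);
	return 0;
}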
+
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
@@ -465,17 +493,18 @@
*/
void __init free_area_init(unsigned int *zones_size)
{
- mem_map_t * p;
+ struct page * p;
unsigned long i, j;
unsigned long map_size;
unsigned int totalpages, offset;
totalpages = 0;
- for (i = 0; i < NR_ZONES; i++)
- totalpages += zones_size[i];
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ unsigned long size = zones_size[i];
+ totalpages += size;
+ }
printk("totalpages: %08x\n", totalpages);
- i = totalpages >> 7;
/*
* Select nr of pages we try to keep free for important stuff
* with a minimum of 10 pages and a maximum of 256 pages, so
@@ -498,7 +527,6 @@
*/
map_size = totalpages*sizeof(struct page);
mem_map = (struct page *) alloc_bootmem(map_size);
- memset(mem_map, 0, map_size);
/*
* Initially all pages are reserved - free ones are freed
@@ -514,31 +542,48 @@
}
offset = 0;
- for (j = 0; j < NR_ZONES; j++) {
+ for (j = 0; j < MAX_NR_ZONES; j++) {
zone_t *zone = zones + j;
unsigned long mask = -1;
unsigned long size;
size = zones_size[j];
+
+ printk("zone(%ld): %ld pages.\n", j, size);
zone->size = size;
+ if (!size)
+ continue;
+
zone->offset = offset;
- zone->pages_low = freepages.low;
- zone->pages_high = freepages.high;
+ /*
+ * It's unnecessary to balance the high memory zone
+ */
+ if (j != ZONE_HIGHMEM) {
+ zone->pages_low = freepages.low;
+ zone->pages_high = freepages.high;
+ }
zone->low_on_memory = 0;
+ for (i = 0; i < size; i++) {
+ struct page *page = mem_map + offset + i;
+ page->zone = zone;
+ if (j != ZONE_HIGHMEM)
+ page->virtual = __page_address(page);
+ }
+
offset += size;
for (i = 0; i < MAX_ORDER; i++) {
unsigned long bitmap_size;
- unsigned int * map;
+
memlist_init(&zone->free_area[i].free_list);
mask += mask;
size = (size + ~mask) & mask;
bitmap_size = size >> i;
bitmap_size = (bitmap_size + 7) >> 3;
bitmap_size = LONG_ALIGN(bitmap_size);
- map = (unsigned int *) alloc_bootmem(bitmap_size);
- zone->free_area[i].map = map;
- memset((void *) map, 0, bitmap_size);
+ zone->free_area[i].map =
+ (unsigned int *) alloc_bootmem(bitmap_size);
}
}
+ build_zonelists();
}
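free_area_init() sizes one buddy bitmap per order: the zone size is first rounded up to the alignment of the next order, then one bit is reserved per order-sized block and the byte count is rounded up to a long boundary with LONG_ALIGN(). The same arithmetic, reproduced in userspace for a hypothetical 16384-page zone:

/* Reproduction of the free_area bitmap sizing; zone size is hypothetical. */
#include <stdio.h>

#define LONG_ALIGN(x) (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))
#define MAX_ORDER 10

int main(void)
{
	unsigned long zone_pages = 16384;	/* hypothetical zone size */
	unsigned long mask = ~0UL;		/* '-1' in the kernel source */
	unsigned long size = zone_pages;
	int i;

	for (i = 0; i < MAX_ORDER; i++) {
		unsigned long bitmap_size;

		mask += mask;			/* ~mask == (1UL << (i+1)) - 1 */
		size = (size + ~mask) & mask;	/* round up to order-(i+1) alignment */
		bitmap_size = size >> i;		/* one bit per order-i block */
		bitmap_size = (bitmap_size + 7) >> 3;	/* bits -> bytes */
		bitmap_size = LONG_ALIGN(bitmap_size);
		printf("order %2d: %lu bytes of buddy bitmap\n", i, bitmap_size);
	}
	return 0;
}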