patch-2.4.6 linux/arch/ppc/mm/init.c

diff -u --recursive --new-file v2.4.5/linux/arch/ppc/mm/init.c linux/arch/ppc/mm/init.c
@@ -1,5 +1,5 @@
 /*
- * BK Id: SCCS/s.init.c 1.22 05/17/01 18:14:23 cort
+ * BK Id: SCCS/s.init.c 1.27 06/28/01 15:50:17 paulus
  */
 /*
  *  PowerPC version 
@@ -73,9 +73,15 @@
 
 #define	PGTOKB(pages)	(((pages) * PAGE_SIZE) >> 10)
 
+mm_context_t next_mmu_context;
+unsigned long context_map[(LAST_CONTEXT+1) / (8*sizeof(unsigned long))];
+#ifdef FEW_CONTEXTS
+atomic_t nr_free_contexts;
+struct mm_struct *context_mm[LAST_CONTEXT+1];
+void steal_context(void);
+#endif /* FEW_CONTEXTS */
+
 int prom_trashed;
-atomic_t next_mmu_context;
-rwlock_t context_overflow_lock __cacheline_aligned = RW_LOCK_UNLOCKED;
 unsigned long *end_of_DRAM;
 unsigned long total_memory;
 unsigned long total_lowmem;
@@ -187,12 +193,6 @@
 /* max amount of low RAM to map in */
 unsigned long __max_low_memory = MAX_LOW_MEM;
 
-void __bad_pte(pmd_t *pmd)
-{
-	printk("Bad pmd in pte_alloc: %08lx\n", pmd_val(*pmd));
-	pmd_val(*pmd) = (unsigned long) BAD_PAGETABLE;
-}
-
 int do_check_pgt_cache(int low, int high)
 {
 	int freed = 0;
@@ -211,35 +211,6 @@
 	return freed;
 }
 
-/*
- * BAD_PAGE is the page that is used for page faults when linux
- * is out-of-memory. Older versions of linux just did a
- * do_exit(), but using this instead means there is less risk
- * for a process dying in kernel mode, possibly leaving a inode
- * unused etc..
- *
- * BAD_PAGETABLE is the accompanying page-table: it is initialized
- * to point to BAD_PAGE entries.
- *
- * ZERO_PAGE is a special page that is used for zero-initialized
- * data and COW.
- */
-pte_t *empty_bad_page_table;
-
-pte_t * __bad_pagetable(void)
-{
-	clear_page(empty_bad_page_table);
-	return empty_bad_page_table;
-}
-
-void *empty_bad_page;
-
-pte_t __bad_page(void)
-{
-	clear_page(empty_bad_page);
-	return pte_mkdirty(mk_pte_phys(__pa(empty_bad_page), PAGE_SHARED));
-}
-
 void show_mem(void)
 {
 	int i,free = 0,total = 0,reserved = 0;
@@ -319,21 +290,10 @@
 
 void si_meminfo(struct sysinfo *val)
 {
-	int i;
-
-	i = max_mapnr;
-	val->totalram = 0;
+	val->totalram = totalram_pages;
 	val->sharedram = 0;
 	val->freeram = nr_free_pages();
 	val->bufferram = atomic_read(&buffermem_pages);
-	while (i-- > 0)  {
-		if (PageReserved(mem_map+i))
-			continue;
-		val->totalram++;
-		if (!atomic_read(&mem_map[i].count))
-			continue;
-		val->sharedram += atomic_read(&mem_map[i].count) - 1;
-	}
 	val->totalhigh = totalhigh_pages;
 	val->freehigh = nr_free_highpages();
 	val->mem_unit = PAGE_SIZE;
@@ -482,14 +442,16 @@
 	if (pg != 0) {
 		err = 0;
 		set_pte(pg, mk_pte_phys(pa & PAGE_MASK, __pgprot(flags)));
-		if (mem_init_done)
-			flush_hash_page(0, va);
+#if !defined(CONFIG_4xx) && !defined(CONFIG_8xx)
+		if (mem_init_done && Hash != 0)
+			flush_hash_page(0, va, pg);
+#endif /* !4xx && !8xx */
 	}
 	spin_unlock(&init_mm.page_table_lock);
 	return err;
 }
 
-#ifndef CONFIG_8xx
+#if !defined(CONFIG_4xx) && !defined(CONFIG_8xx)
 /*
  * TLB flushing:
  *
@@ -510,19 +472,14 @@
 void
 local_flush_tlb_all(void)
 {
-#ifdef CONFIG_PPC64BRIDGE
-	/* XXX this assumes that the vmalloc arena starts no lower than
-	 * 0xd0000000 on 64-bit machines. */
-	flush_hash_segments(0xd, 0xffffff);
-#else
-	/* this could cause problems on SMP with nobats  -- paulus */
-	/* XXX no hash_table_lock?  interesting  -- paulus */
-	__clear_user(Hash, Hash_size);
-	_tlbia();
+	/* aargh!!! */
+	/* just flush the kernel part of the address space, that's
+	   all that the current callers of this require.  -- paulus. */
+	local_flush_tlb_range(&init_mm, TASK_SIZE, ~0UL);
+
 #ifdef CONFIG_SMP
 	smp_send_tlb_invalidate(0);
 #endif /* CONFIG_SMP */
-#endif /* CONFIG_PPC64BRIDGE */
 }
 
 /*
@@ -533,30 +490,18 @@
 void
 local_flush_tlb_mm(struct mm_struct *mm)
 {
-	if (mm->context == 0) {
-		/* don't try to reassign a new context to the kernel */
-		/*
-		 * This could cause problems on SMP if we aren't using
-		 * the BATs (e.g. on POWER4 or if the nobats option is used).
-		 * The problem scenario is that one cpu is doing
-		 * flush_hash_page or similar when another cpu clears
-		 * out the HPTEs which map the flush_hash_page text
-		 * and the hash table.  hash_page will then deadlock.
-		 * We need some way to have "protected" HPTEs or else
-		 * do all hash-table manipulation with the MMU off.
-		 *  -- paulus.
-		 */
-#ifdef CONFIG_PPC64BRIDGE
-		flush_hash_segments(0xd, 0xf);
-#else
-		flush_hash_segments(0xc, 0xf);
-#endif CONFIG_PPC64BRIDGE
+	if (Hash == 0) {
 		_tlbia();
 		return;
 	}
-	mm->context = NO_CONTEXT;
-	if (mm == current->mm)
-		activate_mm(mm, mm);
+
+	if (mm->map_count) {
+		struct vm_area_struct *mp;
+		for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+			local_flush_tlb_range(mm, mp->vm_start, mp->vm_end);
+	} else
+		local_flush_tlb_range(mm, 0, TASK_SIZE);
+
 #ifdef CONFIG_SMP
 	smp_send_tlb_invalidate(0);
 #endif	
@@ -565,10 +510,21 @@
 void
 local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
-	if (vmaddr < TASK_SIZE)
-		flush_hash_page(vma->vm_mm->context, vmaddr);
-	else
-		flush_hash_page(0, vmaddr);
+	struct mm_struct *mm;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	if (Hash == 0) {
+		_tlbie(vmaddr);
+		return;
+	}
+	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
+	pmd = pmd_offset(pgd_offset(mm, vmaddr), vmaddr);
+	if (!pmd_none(*pmd)) {
+		pte = pte_offset(pmd, vmaddr);
+		if (pte_val(*pte) & _PAGE_HASHPTE)
+			flush_hash_page(mm->context, vmaddr, pte);
+	}
 #ifdef CONFIG_SMP
 	smp_send_tlb_invalidate(0);
 #endif	
@@ -576,28 +532,43 @@
 
 
 /*
- * for each page addr in the range, call MMU_invalidate_page()
- * if the range is very large and the hash table is small it might be
- * faster to do a search of the hash table and just invalidate pages
- * that are in the range but that's for study later.
- * -- Cort
+ * For each address in the range, find the pte for the address
+ * and check _PAGE_HASHPTE bit; if it is set, find and destroy
+ * the corresponding HPTE.
  */
 void
 local_flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end)
 {
-	start &= PAGE_MASK;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long pmd_end;
+	unsigned int ctx = mm->context;
 
-	if (mm->context != 0) {
-		if (end > TASK_SIZE)
-			end = TASK_SIZE;
-		if (end - start > 20 * PAGE_SIZE) {
-			flush_tlb_mm(mm);
-			return;
-		}
+	if (Hash == 0) {
+		_tlbia();
+		return;
 	}
-
-	for (; start < end; start += PAGE_SIZE)
-		flush_hash_page(mm->context, start);
+	start &= PAGE_MASK;
+	if (start >= end)
+		return;
+	pmd = pmd_offset(pgd_offset(mm, start), start);
+	do {
+		pmd_end = (start + PGDIR_SIZE) & PGDIR_MASK;
+		if (!pmd_none(*pmd)) {
+			if (!pmd_end || pmd_end > end)
+				pmd_end = end;
+			pte = pte_offset(pmd, start);
+			do {
+				if ((pte_val(*pte) & _PAGE_HASHPTE) != 0)
+					flush_hash_page(ctx, start, pte);
+				start += PAGE_SIZE;
+				++pte;
+			} while (start && start < pmd_end);
+		} else {
+			start = pmd_end;
+		}
+		++pmd;
+	} while (start && start < end);
 
 #ifdef CONFIG_SMP
 	smp_send_tlb_invalidate(0);
@@ -605,59 +576,6 @@
 }
 
 /*
- * The context counter has overflowed.
- * We set mm->context to NO_CONTEXT for all mm's in the system.
- * We assume we can get to all mm's by looking as tsk->mm for
- * all tasks in the system.
- */
-void
-mmu_context_overflow(void)
-{
-	struct task_struct *tsk;
-
-	printk(KERN_DEBUG "mmu_context_overflow\n");
-	/* acquire the write lock for context overflow */
-	write_lock (&context_overflow_lock);
-	/* recheck if overflow still exists */
-	if (atomic_read(&next_mmu_context) == LAST_CONTEXT) {
-		read_lock(&tasklist_lock);
-		for_each_task(tsk) {
-			if (tsk->mm)
-				tsk->mm->context = NO_CONTEXT;
-		}
-		read_unlock(&tasklist_lock);
-		flush_hash_segments(0x10, 0xffffff);
-#ifdef CONFIG_SMP
-		smp_send_tlb_invalidate(0);
-#endif	
-		atomic_set(&next_mmu_context, 0);
-	}
-	write_unlock (&context_overflow_lock);
-	/* make sure current always has a context */
-	/* need to check to assure current task has an mm */
-	/*   - idle thread does not have an MM */
-	if (current->mm) {
-		current->mm->context = MUNGE_CONTEXT(atomic_inc_return(&next_mmu_context));
-		set_context(current->mm->context, current->mm->pgd);
-	}
-}
-#else /* CONFIG_8xx */
-void
-mmu_context_overflow(void)
-{
-	atomic_set(&next_mmu_context, -1);
-}
-#endif /* CONFIG_8xx */
-
-void flush_page_to_ram(struct page *page)
-{
-	unsigned long vaddr = (unsigned long) kmap(page);
-	__flush_page_to_ram(vaddr);
-	kunmap(page);
-}
-
-#if !defined(CONFIG_4xx) && !defined(CONFIG_8xx)
-/*
  * Set up one of the I/D BAT (block address translation) register pairs.
  * The parameters are not checked; in particular size must be a power
  * of 2 between 128k and 256M.
@@ -717,8 +635,6 @@
 /*
  * Map in all of physical memory starting at KERNELBASE.
  */
-#define PAGE_KERNEL_RO	__pgprot(_PAGE_PRESENT | _PAGE_ACCESSED)
-
 static void __init mapin_ram(void)
 {
 	unsigned long v, p, s, f;
@@ -768,10 +684,10 @@
 		f = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_SHARED;
 #if defined(CONFIG_KGDB) || defined(CONFIG_XMON)
 		/* Allows stub to set breakpoints everywhere */
-		f |= _PAGE_RW | _PAGE_DIRTY | _PAGE_HWWRITE;
+		f |= _PAGE_RW | _PAGE_DIRTY;
 #else
 		if ((char *) v < _stext || (char *) v >= etext)
-			f |= _PAGE_RW | _PAGE_DIRTY | _PAGE_HWWRITE;
+			f |= _PAGE_RW | _PAGE_DIRTY;
 #ifndef CONFIG_8xx
 		else
 			/* On the powerpc (not 8xx), no user access
@@ -839,16 +755,59 @@
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
+	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
 		set_page_count(virt_to_page(start), 1);
 		free_page(start);
 		totalram_pages++;
 	}
-	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 }
 #endif
 
+/*
+ * Initialize the context management stuff.
+ */
+static void mmu_context_init(void)
+{
+	context_map[0] = 1;	/* init_mm uses context 0 */
+	next_mmu_context = 1;
+#ifdef FEW_CONTEXTS
+	atomic_set(&nr_free_contexts, LAST_CONTEXT);
+	context_mm[0] = &init_mm;
+#endif /* FEW_CONTEXTS */
+}
+
+#ifdef FEW_CONTEXTS
+/*
+ * Steal a context from a task that has one at the moment.
+ * This is only used on 8xx and 4xx and we presently assume that
+ * they don't do SMP.  If they do then this will have to check
+ * whether the MM we steal is in use.
+ * We also assume that this is only used on systems that don't
+ * use an MMU hash table - this is true for 8xx and 4xx.
+ * This isn't an LRU system, it just frees up each context in
+ * turn (sort-of pseudo-random replacement :).  This would be the
+ * place to implement an LRU scheme if anyone was motivated to do it.
+ *  -- paulus
+ */
+void steal_context(void)
+{
+	struct mm_struct *mm;
+
+	/* free up context `next_mmu_context' */
+	/* if we shouldn't free context 0, don't... */
+#ifdef CONFIG_4xx
+	if (next_mmu_context == 0)
+		next_mmu_context = 1;
+#endif /* CONFIG_4xx */
+	mm = context_mm[next_mmu_context];
+	flush_tlb_mm(mm);
+	destroy_context(mm);
+}
+#endif /* FEW_CONTEXTS */
+
 extern boot_infos_t *disp_bi;
 
 /*
@@ -903,6 +862,8 @@
 
         mtspr(SPRN_DCCR, 0x80000000);	/* 128 MB of data space at 0x0. */
         mtspr(SPRN_ICCR, 0x80000000);	/* 128 MB of instr. space at 0x0. */
+
+	mmu_context_init();
 }
 
 #else /* !CONFIG_4xx */
@@ -938,11 +899,7 @@
 	/* Map in all of RAM starting at KERNELBASE */
 	mapin_ram();
 
-#if defined(CONFIG_POWER4)
-	ioremap_base = ioremap_bot = 0xfffff000;
-	isa_io_base = (unsigned long) ioremap(0xffd00000, 0x200000) + 0x100000;
-
-#elif defined(CONFIG_8xx)
+#if defined(CONFIG_8xx)
         /* Now map in some of the I/O space that is generically needed
          * or shared with multiple devices.
          * All of this fits into the same 4Mbyte region, so it only
@@ -974,7 +931,7 @@
 #ifdef CONFIG_PCI
         ioremap(PCI_CSR_ADDR, PCI_CSR_SIZE);
 #endif
-#else /* !CONFIG_POWER4 && !CONFIG_8xx */
+#else /* !CONFIG_8xx */
 	/*
 	 * Setup the bat mappings we're going to load that cover
 	 * the io areas.  RAM was mapped by mapin_ram().
@@ -1024,7 +981,7 @@
 		break;
 	}
 	ioremap_bot = ioremap_base;
-#endif /* CONFIG_POWER4 || CONFIG_8xx */
+#endif /* CONFIG_8xx */
 
 	if ( ppc_md.progress ) ppc_md.progress("MMU:exit", 0x211);
 #ifdef CONFIG_BOOTX_TEXT
@@ -1032,6 +989,8 @@
 	if (_machine == _MACH_Pmac || _machine == _MACH_chrp)
 		map_bootx_text();
 #endif
+
+	mmu_context_init();
 }
 #endif /* CONFIG_4xx */
 
@@ -1095,12 +1054,6 @@
 #endif /* CONFIG_HIGHMEM */
 
 	/*
-	 * Grab some memory for bad_page and bad_pagetable to use.
-	 */
-	empty_bad_page = alloc_bootmem_pages(PAGE_SIZE);
-	empty_bad_page_table = alloc_bootmem_pages(PAGE_SIZE);
-
-	/*
 	 * All pages are DMA-able so we put them all in the DMA zone.
 	 */
 	zones_size[ZONE_DMA] = total_lowmem >> PAGE_SHIFT;
@@ -1128,7 +1081,6 @@
 	highmem_mapnr = total_lowmem >> PAGE_SHIFT;
 	highmem_start_page = mem_map + highmem_mapnr;
 	max_mapnr = total_memory >> PAGE_SHIFT;
-	totalram_pages += max_mapnr - highmem_mapnr;
 #else
 	max_mapnr = max_low_pfn;
 #endif /* CONFIG_HIGHMEM */
@@ -1201,15 +1153,17 @@
 
 #if !defined(CONFIG_4xx) && !defined(CONFIG_8xx)
 /*
- * Initialize the hash table and patch the instructions in head.S.
+ * Initialize the hash table and patch the instructions in hashtable.S.
  */
 static void __init hash_init(void)
 {
 	int Hash_bits, mb, mb2;
 	unsigned int hmask, h;
 
-	extern unsigned int hash_page_patch_A[], hash_page_patch_B[],
-		hash_page_patch_C[], hash_page[];
+	extern unsigned int hash_page_patch_A[];
+	extern unsigned int hash_page_patch_B[], hash_page_patch_C[];
+	extern unsigned int hash_page[];
+	extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
 
 #ifdef CONFIG_PPC64BRIDGE
 	/* The hash table has already been allocated and initialized
@@ -1259,6 +1213,7 @@
 	if ( Hash_size ) {
 		Hash = mem_pieces_find(Hash_size, Hash_size);
 		cacheable_memzero(Hash, Hash_size);
+		_SDR1 = __pa(Hash) | (Hash_mask >> 10);
 	} else
 		Hash = 0;
 #endif /* CONFIG_PPC64BRIDGE */
@@ -1271,10 +1226,10 @@
 		Hash_end = (PTE *) ((unsigned long)Hash + Hash_size);
 
 		/*
-		 * Patch up the instructions in head.S:hash_page
+		 * Patch up the instructions in hashtable.S:create_hpte
 		 */
 		hash_page_patch_A[0] = (hash_page_patch_A[0] & ~0xffff)
-			| (__pa(Hash) >> 16);
+			| ((unsigned int)(Hash) >> 16);
 		hash_page_patch_A[1] = (hash_page_patch_A[1] & ~0x7c0)
 			| (mb << 6);
 		hash_page_patch_A[2] = (hash_page_patch_A[2] & ~0x7c0)
@@ -1283,10 +1238,6 @@
 			| hmask;
 		hash_page_patch_C[0] = (hash_page_patch_C[0] & ~0xffff)
 			| hmask;
-#if 0	/* see hash_page in head.S, note also patch_C ref below */
-		hash_page_patch_D[0] = (hash_page_patch_D[0] & ~0xffff)
-			| hmask;
-#endif
 		/*
 		 * Ensure that the locations we've patched have been written
 		 * out from the data cache and invalidated in the instruction
@@ -1294,6 +1245,19 @@
 		 */
 		flush_icache_range((unsigned long) &hash_page_patch_A[0],
 				   (unsigned long) &hash_page_patch_C[1]);
+		/*
+		 * Patch up the instructions in hashtable.S:flush_hash_page
+		 */
+		flush_hash_patch_A[0] = (flush_hash_patch_A[0] & ~0xffff)
+			| ((unsigned int)(Hash) >> 16);
+		flush_hash_patch_A[1] = (flush_hash_patch_A[1] & ~0x7c0)
+			| (mb << 6);
+		flush_hash_patch_A[2] = (flush_hash_patch_A[2] & ~0x7c0)
+			| (mb2 << 6);
+		flush_hash_patch_B[0] = (flush_hash_patch_B[0] & ~0xffff)
+			| hmask;
+		flush_icache_range((unsigned long) &flush_hash_patch_A[0],
+				   (unsigned long) &flush_hash_patch_B[1]);
 	}
 	else {
 		Hash_end = 0;
@@ -1306,6 +1270,7 @@
 		flush_icache_range((unsigned long) &hash_page[0],
 				   (unsigned long) &hash_page[1]);
 	}
+
 	if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205);
 }
 #endif /* !CONFIG_4xx && !CONFIG_8xx */
@@ -1320,7 +1285,7 @@
 	unsigned long kstart, ksize;
 
 	/*
-	 * Initially, available phyiscal memory is equivalent to all
+	 * Initially, available physical memory is equivalent to all
 	 * physical memory.
 	 */
 
@@ -1356,4 +1321,51 @@
 	if (Hash)
 		mem_pieces_remove(&phys_avail, __pa(Hash), Hash_size, 1);
 #endif /* CONFIG_PPC64BRIDGE */
+}
+
+void flush_page_to_ram(struct page *page)
+{
+	unsigned long vaddr = (unsigned long) kmap(page);
+	__flush_page_to_ram(vaddr);
+	kunmap(page);
+}
+
+#if !defined(CONFIG_4xx) && !defined(CONFIG_8xx)
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+		      pte_t pte)
+{
+	struct mm_struct *mm;
+	pmd_t *pmd;
+	pte_t *ptep;
+	static int nopreload;
+
+	if (Hash == 0 || nopreload)
+		return;
+	mm = (address < TASK_SIZE)? vma->vm_mm: &init_mm;
+	pmd = pmd_offset(pgd_offset(mm, address), address);
+	if (!pmd_none(*pmd)) {
+		ptep = pte_offset(pmd, address);
+		add_hash_page(mm->context, address, ptep);
+	}
+}
+#endif /* !4xx && !8xx */
+
+/*
+ * set_pte stores a linux PTE into the linux page table.
+ * On machines which use an MMU hash table we avoid changing the
+ * _PAGE_HASHPTE bit.
+ */
+void set_pte(pte_t *ptep, pte_t pte)
+{
+#if _PAGE_HASHPTE != 0
+	pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
+#else
+	*ptep = pte;
+#endif
 }
