patch-2.3.20 linux/arch/i386/kernel/smp.c

Next file: linux/arch/i386/kernel/smpboot.c
Previous file: linux/arch/i386/kernel/pci-visws.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.3.19/linux/arch/i386/kernel/smp.c linux/arch/i386/kernel/smp.c
@@ -1,1427 +1,108 @@
 /*
- *	Intel MP v1.1/v1.4 specification support routines for multi-pentium
- *	hosts.
+ *	Intel SMP support routines.
  *
  *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- *	(c) 1998 Ingo Molnar
+ *	(c) 1998-99 Ingo Molnar <mingo@redhat.com>
  *
- *	Supported by Caldera http://www.caldera.com.
- *	Much of the core SMP work is based on previous work by Thomas Radke, to
- *	whom a great many thanks are extended.
- *
- *	Thanks to Intel for making available several different Pentium,
- *	Pentium Pro and Pentium-II/Xeon MP machines.
- *
- *	This code is released under the GNU public license version 2 or
- *	later.
- *
- *	Fixes
- *		Felix Koop	:	NR_CPUS used properly
- *		Jose Renau	:	Handle single CPU case.
- *		Alan Cox	:	By repeated request 8) - Total BogoMIP report.
- *		Greg Wright	:	Fix for kernel stacks panic.
- *		Erich Boleyn	:	MP v1.4 and additional changes.
- *	Matthias Sattler	:	Changes for 2.1 kernel map.
- *	Michel Lespinasse	:	Changes for 2.1 kernel map.
- *	Michael Chastain	:	Change trampoline.S to gnu as.
- *		Alan Cox	:	Dumb bug: 'B' step PPro's are fine
- *		Ingo Molnar	:	Added APIC timers, based on code
- *					from Jose Renau
- *		Alan Cox	:	Added EBDA scanning
- *		Ingo Molnar	:	various cleanups and rewrites
- *		Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
- */
-
-#include <linux/config.h>
-#include <linux/mm.h>
-#include <linux/kernel_stat.h>
-#include <linux/delay.h>
-#include <linux/mc146818rtc.h>
-#include <linux/smp_lock.h>
-#include <linux/init.h>
-#include <asm/mtrr.h>
-#include <asm/msr.h>
-
-#include <linux/irq.h>
-
-#define JIFFIE_TIMEOUT 100
-
-extern void update_one_process( struct task_struct *p,
-				unsigned long ticks, unsigned long user,
-				unsigned long system, int cpu);
-/*
- *	Some notes on processor bugs:
- *
- *	Pentium and Pentium Pro (and all CPUs) have bugs. The Linux issues
- *	for SMP are handled as follows.
- *
- *	Pentium Pro
- *		Occasional delivery of 'spurious interrupt' as trap #16. This
- *	is very rare. The kernel logs the event and recovers
- *
- *	Pentium
- *		There is a marginal case where REP MOVS on 100MHz SMP
- *	machines with B stepping processors can fail. XXX should provide
- *	an L1cache=Writethrough or L1cache=off option.
- *
- *		B stepping CPUs may hang. There are hardware work arounds
- *	for this. We warn about it in case your board doesnt have the work
- *	arounds. Basically thats so I can tell anyone with a B stepping
- *	CPU and SMP problems "tough".
- *
- *	Specific items [From Pentium Processor Specification Update]
- *
- *	1AP.	Linux doesn't use remote read
- *	2AP.	Linux doesn't trust APIC errors
- *	3AP.	We work around this
- *	4AP.	Linux never generated 3 interrupts of the same priority
- *		to cause a lost local interrupt.
- *	5AP.	Remote read is never used
- *	9AP.	XXX NEED TO CHECK WE HANDLE THIS XXX
- *	10AP.	XXX NEED TO CHECK WE HANDLE THIS XXX
- *	11AP.	Linux reads the APIC between writes to avoid this, as per
- *		the documentation. Make sure you preserve this as it affects
- *		the C stepping chips too.
- *
- *	If this sounds worrying believe me these bugs are ___RARE___ and
- *	there's about nothing of note with C stepping upwards.
- */
-
-
-/* Kernel spinlock */
-spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;
-
-/*
- * function prototypes:
- */
-static void cache_APIC_registers (void);
-static void stop_this_cpu (void);
-
-static int smp_b_stepping = 0;				/* Set if we find a B stepping CPU			*/
-
-static int max_cpus = -1;				/* Setup configured maximum number of CPUs to activate	*/
-int smp_found_config=0;					/* Have we found an SMP box 				*/
-
-unsigned long cpu_present_map = 0;			/* Bitmask of physically existing CPUs 				*/
-unsigned long cpu_online_map = 0;			/* Bitmask of currently online CPUs 				*/
-int smp_num_cpus = 0;					/* Total count of live CPUs 				*/
-int smp_threads_ready=0;				/* Set when the idlers are all forked 			*/
-volatile int cpu_number_map[NR_CPUS];			/* which CPU maps to which logical number		*/
-volatile int __cpu_logical_map[NR_CPUS];			/* which logical number maps to which CPU		*/
-static volatile unsigned long cpu_callin_map[NR_CPUS] = {0,};	/* We always use 0 the rest is ready for parallel delivery */
-static volatile unsigned long cpu_callout_map[NR_CPUS] = {0,};	/* We always use 0 the rest is ready for parallel delivery */
-volatile unsigned long smp_invalidate_needed;		/* Used for the invalidate map that's also checked in the spinlock */
-volatile unsigned long kstack_ptr;			/* Stack vector for booting CPUs			*/
-struct cpuinfo_x86 cpu_data[NR_CPUS];			/* Per CPU bogomips and other parameters 		*/
-static unsigned int num_processors = 1;			/* Internal processor count				*/
-unsigned long mp_ioapic_addr = 0xFEC00000;		/* Address of the I/O apic (not yet used) 		*/
-unsigned char boot_cpu_id = 0;				/* Processor that is doing the boot up 			*/
-static int smp_activated = 0;				/* Tripped once we need to start cross invalidating 	*/
-int apic_version[NR_CPUS];				/* APIC version number					*/
-unsigned long apic_retval;				/* Just debugging the assembler.. 			*/
-
-volatile unsigned long kernel_counter=0;		/* Number of times the processor holds the lock		*/
-volatile unsigned long syscall_count=0;			/* Number of times the processor holds the syscall lock	*/
-
-volatile unsigned long ipi_count;			/* Number of IPIs delivered				*/
-
-const char lk_lockmsg[] = "lock from interrupt context at %p\n"; 
-
-int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, };
-extern int nr_ioapics;
-extern struct mpc_config_ioapic mp_apics [MAX_IO_APICS];
-extern int mp_irq_entries;
-extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
-extern int mpc_default_type;
-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { -1, };
-int mp_current_pci_id = 0;
-unsigned long mp_lapic_addr = 0;
-int skip_ioapic_setup = 0;				/* 1 if "noapic" boot option passed */
-
-/* #define SMP_DEBUG */
-
-#ifdef SMP_DEBUG
-#define SMP_PRINTK(x)	printk x
-#else
-#define SMP_PRINTK(x)
-#endif
-
-/*
- * IA s/w dev Vol 3, Section 7.4
- */
-#define APIC_DEFAULT_PHYS_BASE 0xfee00000
-
-#define CLEAR_TSC wrmsr(0x10, 0x00001000, 0x00001000)
-
-/*
- *	Setup routine for controlling SMP activation
- *
- *	Command-line option of "nosmp" or "maxcpus=0" will disable SMP
- *      activation entirely (the MPS table probe still happens, though).
- *
- *	Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
- *	greater than 0, limits the maximum number of CPUs activated in
- *	SMP mode to <NUM>.
- */
-
-static int __init nosmp(char *str)
-{
-	max_cpus = 0;
-	return 1;
-}
-
-__setup("nosmp", nosmp);
-
-static int __init maxcpus(char *str)
-{
-	get_option(&str, &max_cpus);
-	return 1;
-}
-
-__setup("maxcpus=", maxcpus);
-
-void ack_APIC_irq(void)
-{
-	/* Clear the IPI */
-
-	/* Dummy read */
-	apic_read(APIC_SPIV);
-
-	/* Docs say use 0 for future compatibility */
-	apic_write(APIC_EOI, 0);
-}
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
-#ifndef CONFIG_X86_VISWS_APIC
-/*
- *	Checksum an MP configuration block.
- */
-
-static int mpf_checksum(unsigned char *mp, int len)
-{
-	int sum=0;
-	while(len--)
-		sum+=*mp++;
-	return sum&0xFF;
-}
-
-/*
- *	Processor encoding in an MP configuration block
- */
-
-static char *mpc_family(int family,int model)
-{
-	static char n[32];
-	static char *model_defs[]=
-	{
-		"80486DX","80486DX",
-		"80486SX","80486DX/2 or 80487",
-		"80486SL","Intel5X2(tm)",
-		"Unknown","Unknown",
-		"80486DX/4"
-	};
-	if (family==0x6)
-		return("Pentium(tm) Pro");
-	if (family==0x5)
-		return("Pentium(tm)");
-	if (family==0x0F && model==0x0F)
-		return("Special controller");
-	if (family==0x04 && model<9)
-		return model_defs[model];
-	sprintf(n,"Unknown CPU [%d:%d]",family, model);
-	return n;
-}
-
-
-/*
- *	Read the MPC
- */
-
-static int __init smp_read_mpc(struct mp_config_table *mpc)
-{
-	char str[16];
-	int count=sizeof(*mpc);
-	int ioapics = 0;
-	unsigned char *mpt=((unsigned char *)mpc)+count;
-
-	if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4))
-	{
-		panic("SMP mptable: bad signature [%c%c%c%c]!\n",
-			mpc->mpc_signature[0],
-			mpc->mpc_signature[1],
-			mpc->mpc_signature[2],
-			mpc->mpc_signature[3]);
-		return 1;
-	}
-	if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length))
-	{
-		panic("SMP mptable: checksum error!\n");
-		return 1;
-	}
-	if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04)
-	{
-		printk("Bad Config Table version (%d)!!\n",mpc->mpc_spec);
-		return 1;
-	}
-	memcpy(str,mpc->mpc_oem,8);
-	str[8]=0;
-	printk("OEM ID: %s ",str);
-	
-	memcpy(str,mpc->mpc_productid,12);
-	str[12]=0;
-	printk("Product ID: %s ",str);
-
-	printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
-
-	/* save the local APIC address, it might be non-default */
-	mp_lapic_addr = mpc->mpc_lapic;
-
-	/*
-	 *	Now process the configuration blocks.
-	 */
-	
-	while(count<mpc->mpc_length)
-	{
-		switch(*mpt)
-		{
-			case MP_PROCESSOR:
-			{
-				struct mpc_config_processor *m=
-					(struct mpc_config_processor *)mpt;
-				if (m->mpc_cpuflag&CPU_ENABLED)
-				{
-					printk("Processor #%d %s APIC version %d\n",
-						m->mpc_apicid,
-						mpc_family((m->mpc_cpufeature&
-							CPU_FAMILY_MASK)>>8,
-							(m->mpc_cpufeature&
-								CPU_MODEL_MASK)>>4),
-						m->mpc_apicver);
-#ifdef SMP_DEBUG
-					if (m->mpc_featureflag&(1<<0))
-						printk("    Floating point unit present.\n");
-					if (m->mpc_featureflag&(1<<7))
-						printk("    Machine Exception supported.\n");
-					if (m->mpc_featureflag&(1<<8))
-						printk("    64 bit compare & exchange supported.\n");
-					if (m->mpc_featureflag&(1<<9))
-						printk("    Internal APIC present.\n");
-#endif
-					if (m->mpc_cpuflag&CPU_BOOTPROCESSOR)
-					{
-						SMP_PRINTK(("    Bootup CPU\n"));
-						boot_cpu_id=m->mpc_apicid;
-					}
-					else	/* Boot CPU already counted */
-						num_processors++;
-
-					if (m->mpc_apicid>NR_CPUS)
-						printk("Processor #%d unused. (Max %d processors).\n",m->mpc_apicid, NR_CPUS);
-					else
-					{
-						int ver = m->mpc_apicver;
-
-						cpu_present_map|=(1<<m->mpc_apicid);
-						/*
-						 * Validate version
-						 */
-						if (ver == 0x0) {
-							printk("BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
-							ver = 0x10;
-						}
-						apic_version[m->mpc_apicid] = ver;
-					}
-				}
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
-				break;
-			}
-			case MP_BUS:
-			{
-				struct mpc_config_bus *m=
-					(struct mpc_config_bus *)mpt;
-				memcpy(str,m->mpc_bustype,6);
-				str[6]=0;
-				SMP_PRINTK(("Bus #%d is %s\n",
-					m->mpc_busid,
-					str));
-				if (strncmp(m->mpc_bustype,"ISA",3) == 0)
-					mp_bus_id_to_type[m->mpc_busid] =
-						MP_BUS_ISA;
-				else
-				if (strncmp(m->mpc_bustype,"EISA",4) == 0)
-					mp_bus_id_to_type[m->mpc_busid] =
-						MP_BUS_EISA;
-				if (strncmp(m->mpc_bustype,"PCI",3) == 0) {
-					mp_bus_id_to_type[m->mpc_busid] =
-						MP_BUS_PCI;
-					mp_bus_id_to_pci_bus[m->mpc_busid] =
-						mp_current_pci_id;
-					mp_current_pci_id++;
-				}
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
-				break;
-			}
-			case MP_IOAPIC:
-			{
-				struct mpc_config_ioapic *m=
-					(struct mpc_config_ioapic *)mpt;
-				if (m->mpc_flags&MPC_APIC_USABLE)
-				{
-					ioapics++;
-					printk("I/O APIC #%d Version %d at 0x%lX.\n",
-						m->mpc_apicid,m->mpc_apicver,
-						m->mpc_apicaddr);
-					mp_apics [nr_ioapics] = *m;
-					if (++nr_ioapics > MAX_IO_APICS)
-						--nr_ioapics;
-				}
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
-				break;
-			}
-			case MP_INTSRC:
-			{
-				struct mpc_config_intsrc *m=
-					(struct mpc_config_intsrc *)mpt;
-
-				mp_irqs [mp_irq_entries] = *m;
-				if (++mp_irq_entries == MAX_IRQ_SOURCES) {
-					printk("Max irq sources exceeded!!\n");
-					printk("Skipping remaining sources.\n");
-					--mp_irq_entries;
-				}
-
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
-				break;
-			}
-			case MP_LINTSRC:
-			{
-				struct mpc_config_intlocal *m=
-					(struct mpc_config_intlocal *)mpt;
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
-				break;
-			}
-		}
-	}
-	if (ioapics > MAX_IO_APICS)
-	{
-		printk("Warning: Max I/O APICs exceeded (max %d, found %d).\n", MAX_IO_APICS, ioapics);
-		printk("Warning: switching to non APIC mode.\n");
-		skip_ioapic_setup=1;
-	}
-	return num_processors;
-}
-
-/*
- *	Scan the memory blocks for an SMP configuration block.
- */
-
-static int __init smp_scan_config(unsigned long base, unsigned long length)
-{
-	unsigned long *bp=phys_to_virt(base);
-	struct intel_mp_floating *mpf;
-
-	SMP_PRINTK(("Scan SMP from %p for %ld bytes.\n",
-		bp,length));
-	if (sizeof(*mpf)!=16)
-		printk("Error: MPF size\n");
-
-	while (length>0)
-	{
-		if (*bp==SMP_MAGIC_IDENT)
-		{
-			mpf=(struct intel_mp_floating *)bp;
-			if (mpf->mpf_length==1 &&
-				!mpf_checksum((unsigned char *)bp,16) &&
-				(mpf->mpf_specification == 1
-				 || mpf->mpf_specification == 4) )
-			{
-				printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
-				if (mpf->mpf_feature2&(1<<7))
-					printk("    IMCR and PIC compatibility mode.\n");
-				else
-					printk("    Virtual Wire compatibility mode.\n");
-				smp_found_config=1;
-				/*
-				 *	Now see if we need to read further.
-				 */
-				if (mpf->mpf_feature1!=0)
-				{
-					unsigned long cfg;
-
-					/* local APIC has default address */
-					mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-					/*
-					 *	We need to know what the local
-					 *	APIC id of the boot CPU is!
-					 */
-
-/*
- *
- *	HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK
- *
- *	It's not just a crazy hack.  ;-)
- */
-					/*
-					 *	Standard page mapping
-					 *	functions don't work yet.
-					 *	We know that page 0 is not
-					 *	used.  Steal it for now!
-					 */
-			
-					cfg=pg0[0];
-					pg0[0] = (mp_lapic_addr | _PAGE_RW | _PAGE_PRESENT);
-					local_flush_tlb();
-
-					boot_cpu_id = GET_APIC_ID(*((volatile unsigned long *) APIC_ID));
-
-					/*
-					 *	Give it back
-					 */
-
-					pg0[0]= cfg;
-					local_flush_tlb();
-
-/*
- *
- *	END OF HACK   END OF HACK   END OF HACK   END OF HACK   END OF HACK
- *
- */
-					/*
-					 *	2 CPUs, numbered 0 & 1.
-					 */
-					cpu_present_map=3;
-					num_processors=2;
-					printk("I/O APIC at 0xFEC00000.\n");
-
-					/*
-					 * Save the default type number, we
-					 * need it later to set the IO-APIC
-					 * up properly:
-					 */
-					mpc_default_type = mpf->mpf_feature1;
-
-					printk("Bus #0 is ");
-				}
-				switch(mpf->mpf_feature1)
-				{
-					case 1:
-					case 5:
-						printk("ISA\n");
-						break;
-					case 2:
-						printk("EISA with no IRQ8 chaining\n");
-						break;
-					case 6:
-					case 3:
-						printk("EISA\n");
-						break;
-					case 4:
-					case 7:
-						printk("MCA\n");
-						break;
-					case 0:
-						break;
-					default:
-						printk("???\nUnknown standard configuration %d\n",
-							mpf->mpf_feature1);
-						return 1;
-				}
-				if (mpf->mpf_feature1>4)
-				{
-					printk("Bus #1 is PCI\n");
-
-					/*
-					 *	Set local APIC version to
-					 *	the integrated form.
-					 *	It's initialized to zero
-					 *	otherwise, representing
-					 *	a discrete 82489DX.
-					 */
-					apic_version[0] = 0x10;
-					apic_version[1] = 0x10;
-				}
-				/*
-				 *	Read the physical hardware table.
-				 *	Anything here will override the
-				 *	defaults.
-				 */
-				if (mpf->mpf_physptr)
-					smp_read_mpc((void *)mpf->mpf_physptr);
-
-				__cpu_logical_map[0] = boot_cpu_id;
-				global_irq_holder = boot_cpu_id;
-				current->processor = boot_cpu_id;
-
-				printk("Processors: %d\n", num_processors);
-				/*
-				 *	Only use the first configuration found.
-				 */
-				return 1;
-			}
-		}
-		bp+=4;
-		length-=16;
-	}
-
-	return 0;
-}
-
-void __init init_intel_smp (void)
-{
-	/*
-	 * FIXME: Linux assumes you have 640K of base ram..
-	 * this continues the error...
-	 *
-	 * 1) Scan the bottom 1K for a signature
-	 * 2) Scan the top 1K of base RAM
-	 * 3) Scan the 64K of bios
-	 */
-	if (!smp_scan_config(0x0,0x400) &&
-	    !smp_scan_config(639*0x400,0x400) &&
-	    !smp_scan_config(0xF0000,0x10000)) {
-		/*
-		 * If it is an SMP machine we should know now, unless the
-		 * configuration is in an EISA/MCA bus machine with an
-		 * extended bios data area. 
-		 *
-		 * there is a real-mode segmented pointer pointing to the
-		 * 4K EBDA area at 0x40E, calculate and scan it here.
-		 *
-		 * NOTE! There are Linux loaders that will corrupt the EBDA
-		 * area, and as such this kind of SMP config may be less
-		 * trustworthy, simply because the SMP table may have been
-		 * stomped on during early boot. These loaders are buggy and
-		 * should be fixed.
-		 */
-		unsigned int address;
-
-		address = *(unsigned short *)phys_to_virt(0x40E);
-		address<<=4;
-		smp_scan_config(address, 0x1000);
-		if (smp_found_config)
-			printk(KERN_WARNING "WARNING: MP table in the EBDA can be UNSAFE, contact linux-smp@vger.rutgers.edu if you experience SMP problems!\n");
-	}
-}
-
-#else
-
-/*
- * The Visual Workstation is Intel MP compliant in the hardware
- * sense, but it doesnt have a BIOS(-configuration table).
- * No problem for Linux.
- */
-void __init init_visws_smp(void)
-{
-	smp_found_config = 1;
-
-	cpu_present_map |= 2; /* or in id 1 */
-	apic_version[1] |= 0x10; /* integrated APIC */
-	apic_version[0] |= 0x10;
-
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-} 
-
-#endif
-
-/*
- * - Intel MP Configuration Table
- * - or SGI Visual Workstation configuration
- */
-void __init init_smp_config (void)
-{
-#ifndef CONFIG_VISWS
-	init_intel_smp();
-#else
-	init_visws_smp();
-#endif
-}
-
-
-
-/*
- *	Trampoline 80x86 program as an array.
- */
-
-extern unsigned char trampoline_data [];
-extern unsigned char trampoline_end  [];
-static unsigned char *trampoline_base;
-
-/*
- *	Currently trivial. Write the real->protected mode
- *	bootstrap into the page concerned. The caller
- *	has made sure it's suitably aligned.
- */
-
-static unsigned long __init setup_trampoline(void)
-{
-	memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
-	return virt_to_phys(trampoline_base);
-}
-
-/*
- *	We are called very early to get the low memory for the
- *	SMP bootup trampoline page.
- */
-unsigned long __init smp_alloc_memory(unsigned long mem_base)
-{
-	if (virt_to_phys((void *)mem_base) >= 0x9F000)
-		panic("smp_alloc_memory: Insufficient low memory for kernel trampoline 0x%lx.", mem_base);
-	trampoline_base = (void *)mem_base;
-	return mem_base + PAGE_SIZE;
-}
-
-/*
- *	The bootstrap kernel entry code has set these up. Save them for
- *	a given CPU
- */
-
-void __init smp_store_cpu_info(int id)
-{
-	struct cpuinfo_x86 *c=&cpu_data[id];
-
-	*c = boot_cpu_data;
-	c->pte_quick = 0;
-	c->pgd_quick = 0;
-	c->pgtable_cache_sz = 0;
-	identify_cpu(c);
-	/*
-	 *	Mask B, Pentium, but not Pentium MMX
-	 */
-	if (c->x86_vendor == X86_VENDOR_INTEL &&
-	    c->x86 == 5 &&
-	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
-	    c->x86_model <= 3)
-		smp_b_stepping=1;		/* Remember we have B step Pentia with bugs */
-}
-
-/*
- *	Architecture specific routine called by the kernel just before init is
- *	fired off. This allows the BP to have everything in order [we hope].
- *	At the end of this all the APs will hit the system scheduling and off
- *	we go. Each AP will load the system gdt's and jump through the kernel
- *	init into idle(). At this point the scheduler will one day take over
- * 	and give them jobs to do. smp_callin is a standard routine
- *	we use to track CPUs as they power up.
- */
-
-static atomic_t smp_commenced = ATOMIC_INIT(0);
-
-void __init smp_commence(void)
-{
-	/*
-	 *	Lets the callins below out of their loop.
-	 */
-	SMP_PRINTK(("Setting commenced=1, go go go\n"));
-
-	wmb();
-	atomic_set(&smp_commenced,1);
-}
-
-void __init enable_local_APIC(void)
-{
-	unsigned long value;
-
- 	value = apic_read(APIC_SPIV);
- 	value |= (1<<8);		/* Enable APIC (bit==1) */
-#if 0
- 	value &= ~(1<<9);		/* Enable focus processor (bit==0) */
-#else
-	value |= (1<<9);		/* Disable focus processor (bit==1) */
-#endif
-	value |= 0xff;			/* Set spurious IRQ vector to 0xff */
- 	apic_write(APIC_SPIV,value);
-
-	/*
-	 * Set Task Priority to 'accept all'
-	 */
- 	value = apic_read(APIC_TASKPRI);
- 	value &= ~APIC_TPRI_MASK;
- 	apic_write(APIC_TASKPRI,value);
-
-	/*
-	 * Clear the logical destination ID, just to be safe.
-	 * also, put the APIC into flat delivery mode.
-	 */
- 	value = apic_read(APIC_LDR);
-	value &= ~APIC_LDR_MASK;
- 	apic_write(APIC_LDR,value);
-
- 	value = apic_read(APIC_DFR);
-	value |= SET_APIC_DFR(0xf);
- 	apic_write(APIC_DFR, value);
-
-	udelay(100);			/* B safe */
-}
-
-unsigned long __init init_smp_mappings(unsigned long memory_start)
-{
-	unsigned long apic_phys;
-
-	memory_start = PAGE_ALIGN(memory_start);
-	if (smp_found_config) {
-		apic_phys = mp_lapic_addr;
-	} else {
-		/*
-		 * set up a fake all zeroes page to simulate the
-		 * local APIC and another one for the IO-APIC. We
-		 * could use the real zero-page, but it's safer
-		 * this way if some buggy code writes to this page ...
-		 */
-		apic_phys = __pa(memory_start);
-		memset((void *)memory_start, 0, PAGE_SIZE);
-		memory_start += PAGE_SIZE;
-	}
-	set_fixmap(FIX_APIC_BASE,apic_phys);
-	printk("mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys);
-
-#ifdef CONFIG_X86_IO_APIC
-	{
-		unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
-		int i;
-
-		for (i = 0; i < nr_ioapics; i++) {
-			if (smp_found_config) {
-				ioapic_phys = mp_apics[i].mpc_apicaddr;
-			} else {
-				ioapic_phys = __pa(memory_start);
-				memset((void *)memory_start, 0, PAGE_SIZE);
-				memory_start += PAGE_SIZE;
-			}
-			set_fixmap(idx,ioapic_phys);
-			printk("mapped IOAPIC to %08lx (%08lx)\n",
-					__fix_to_virt(idx), ioapic_phys);
-			idx++;
-		}
-	}
-#endif
-
-	return memory_start;
-}
-
-extern void calibrate_delay(void);
-
-void __init smp_callin(void)
-{
-	int cpuid;
-	unsigned long timeout;
-
-	/*
-	 * (This works even if the APIC is not enabled.)
-	 */
-	cpuid = GET_APIC_ID(apic_read(APIC_ID));
-
-	SMP_PRINTK(("CPU#%d waiting for CALLOUT\n", cpuid));
-
-	/*
-	 * STARTUP IPIs are fragile beasts as they might sometimes
-	 * trigger some glue motherboard logic. Complete APIC bus
-	 * silence for 1 second, this overestimates the time the
-	 * boot CPU is spending to send the up to 2 STARTUP IPIs
-	 * by a factor of two. This should be enough.
-	 */
-
-	/*
-	 * Waiting 2s total for startup (udelay is not yet working)
-	 */
-	timeout = jiffies + 2*HZ;
-	while (time_before(jiffies,timeout))
-	{
-		/*
-		 * Has the boot CPU finished it's STARTUP sequence?
-		 */
-		if (test_bit(cpuid, (unsigned long *)&cpu_callout_map[0]))
-			break;
-	}
-
-	while (!time_before(jiffies,timeout)) {
-		printk("BUG: CPU%d started up but did not get a callout!\n",
-			cpuid);
-		stop_this_cpu();
-	}
-
-	/*
-	 * the boot CPU has finished the init stage and is spinning
-	 * on callin_map until we finish. We are free to set up this
-	 * CPU, first the APIC. (this is probably redundant on most
-	 * boards)
-	 */
-	
-	SMP_PRINTK(("CALLIN, before enable_local_APIC().\n"));
-	enable_local_APIC();
-
-	/*
-	 * Set up our APIC timer.
-	 */
-	setup_APIC_clock();
-
- 	__sti();
-
-#ifdef CONFIG_MTRR
-	/*  Must be done before calibration delay is computed  */
-	mtrr_init_secondary_cpu ();
-#endif
-	/*
-	 *	Get our bogomips.
-	 */
-	calibrate_delay();
-	SMP_PRINTK(("Stack at about %p\n",&cpuid));
-
-	/*
-	 *	Save our processor parameters
-	 */
- 	smp_store_cpu_info(cpuid);
-
-	/*
-	 *	Allow the master to continue.
-	 */
-	set_bit(cpuid, (unsigned long *)&cpu_callin_map[0]);
-}
-
-int cpucount = 0;
-
-extern int cpu_idle(void);
-
-/*
- *	Activate a secondary processor.
- */
-int __init start_secondary(void *unused)
-{
-	/*
-	 * Dont put anything before smp_callin(), SMP
-	 * booting is too fragile that we want to limit the
-	 * things done here to the most necessary things.
-	 */
-	cpu_init();
-	smp_callin();
-	while (!atomic_read(&smp_commenced))
-		/* nothing */ ;
-	return cpu_idle();
-}
-
-/*
- * Everything has been set up for the secondary
- * CPUs - they just need to reload everything
- * from the task structure
- * This function must not return.
- */
-void __init initialize_secondary(void)
-{
-	/*
-	 * We don't actually need to load the full TSS,
-	 * basically just the stack pointer and the eip.
-	 */
-
-	asm volatile(
-		"movl %0,%%esp\n\t"
-		"jmp *%1"
-		:
-		:"r" (current->thread.esp),"r" (current->thread.eip));
-}
-
-extern struct {
-	void * esp;
-	unsigned short ss;
-} stack_start;
-
-static int __init fork_by_hand(void)
-{
-	struct pt_regs regs;
-	/* don't care about the eip and regs settings since we'll never
-	   reschedule the forked task. */
-	return do_fork(CLONE_VM|CLONE_PID, 0, &regs);
-}
-
-static void __init do_boot_cpu(int i)
-{
-	unsigned long cfg;
-	pgd_t maincfg;
-	struct task_struct *idle;
-	unsigned long send_status, accept_status;
-	int timeout, num_starts, j;
-	unsigned long start_eip;
-
-	cpucount++;
-	/* We can't use kernel_thread since we must _avoid_ to reschedule
-	   the child. */
-	if (fork_by_hand() < 0)
-		panic("failed fork for CPU %d", i);
-
-	/*
-	 * We remove it from the pidhash and the runqueue
-	 * once we got the process:
-	 */
-	idle = init_task.prev_task;
-	if (!idle)
-		panic("No idle process for CPU %d", i);
-
-	idle->processor = i;
-	__cpu_logical_map[cpucount] = i;
-	cpu_number_map[i] = cpucount;
-	idle->has_cpu = 1; /* we schedule the first task manually */
-	idle->thread.eip = (unsigned long) start_secondary;
-
-	del_from_runqueue(idle);
-	unhash_process(idle);
-	init_tasks[cpucount] = idle;
-
-	/* start_eip had better be page-aligned! */
-	start_eip = setup_trampoline();
-
-	printk("Booting processor %d eip %lx\n", i, start_eip);	/* So we see what's up   */
-	stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
-
-	/*
-	 *	This grunge runs the startup process for
-	 *	the targeted processor.
-	 */
-
-	SMP_PRINTK(("Setting warm reset code and vector.\n"));
-
-	CMOS_WRITE(0xa, 0xf);
-	local_flush_tlb();
-	SMP_PRINTK(("1.\n"));
-	*((volatile unsigned short *) phys_to_virt(0x469)) = start_eip >> 4;
-	SMP_PRINTK(("2.\n"));
-	*((volatile unsigned short *) phys_to_virt(0x467)) = start_eip & 0xf;
-	SMP_PRINTK(("3.\n"));
-
-	maincfg=swapper_pg_dir[0];
-	((unsigned long *)swapper_pg_dir)[0]=0x102007;
-
-	/*
-	 *	Be paranoid about clearing APIC errors.
-	 */
-
-	if ( apic_version[i] & 0xF0 )
-	{
-		apic_write(APIC_ESR, 0);
-		accept_status = (apic_read(APIC_ESR) & 0xEF);
-	}
-
-	/*
-	 *	Status is now clean
-	 */
-	
-	send_status = 	0;
-	accept_status = 0;
-
-	/*
-	 *	Starting actual IPI sequence...
-	 */
-
-	SMP_PRINTK(("Asserting INIT.\n"));
-
-	/*
-	 *	Turn INIT on
-	 */
-			
-	cfg=apic_read(APIC_ICR2);
-	cfg&=0x00FFFFFF;
-	apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); 			/* Target chip     	*/
-	cfg=apic_read(APIC_ICR);
-	cfg&=~0xCDFFF;								/* Clear bits 		*/
-	cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_ASSERT | APIC_DEST_DM_INIT);
-	apic_write(APIC_ICR, cfg);						/* Send IPI */
-
-	udelay(200);
-	SMP_PRINTK(("Deasserting INIT.\n"));
-
-	cfg=apic_read(APIC_ICR2);
-	cfg&=0x00FFFFFF;
-	apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i));			/* Target chip     	*/
-	cfg=apic_read(APIC_ICR);
-	cfg&=~0xCDFFF;								/* Clear bits 		*/
-	cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_DM_INIT);
-	apic_write(APIC_ICR, cfg);						/* Send IPI */
-
-	/*
-	 *	Should we send STARTUP IPIs ?
-	 *
-	 *	Determine this based on the APIC version.
-	 *	If we don't have an integrated APIC, don't
-	 *	send the STARTUP IPIs.
-	 */
-
-	if ( apic_version[i] & 0xF0 )
-		num_starts = 2;
-	else
-		num_starts = 0;
-
-	/*
-	 *	Run STARTUP IPI loop.
-	 */
-
-	for (j = 1; !(send_status || accept_status)
-		    && (j <= num_starts) ; j++)
-	{
-		SMP_PRINTK(("Sending STARTUP #%d.\n",j));
-		apic_write(APIC_ESR, 0);
-		SMP_PRINTK(("After apic_write.\n"));
-
-		/*
-		 *	STARTUP IPI
-		 */
-
-		cfg=apic_read(APIC_ICR2);
-		cfg&=0x00FFFFFF;
-		apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i));			/* Target chip     	*/
-		cfg=apic_read(APIC_ICR);
-		cfg&=~0xCDFFF;								/* Clear bits 		*/
-		cfg |= (APIC_DEST_DM_STARTUP | (start_eip >> 12));						/* Boot on the stack 	*/
-		SMP_PRINTK(("Before start apic_write.\n"));
-		apic_write(APIC_ICR, cfg);						/* Kick the second 	*/
-
-		SMP_PRINTK(("Startup point 1.\n"));
-
-		timeout = 0;
-		SMP_PRINTK(("Waiting for send to finish...\n"));
-		do {
-			SMP_PRINTK(("+"));
-			udelay(100);
-			send_status = apic_read(APIC_ICR) & 0x1000;
-		} while (send_status && (timeout++ < 1000));
-
-		/*
-		 * Give the other CPU some time to accept the IPI.
-		 */
-		udelay(200);
-		accept_status = (apic_read(APIC_ESR) & 0xEF);
-	}
-	SMP_PRINTK(("After Startup.\n"));
-
-	if (send_status)		/* APIC never delivered?? */
-		printk("APIC never delivered???\n");
-	if (accept_status)		/* Send accept error */
-		printk("APIC delivery error (%lx).\n", accept_status);
-
-	if ( !(send_status || accept_status) )
-	{
-		/*
-		 * allow APs to start initializing.
-		 */
-		SMP_PRINTK(("Before Callout %d.\n", i));
-		set_bit(i, (unsigned long *)&cpu_callout_map[0]);
-		SMP_PRINTK(("After Callout %d.\n", i));
-
-		for(timeout=0;timeout<50000;timeout++)
-		{
-			if (cpu_callin_map[0]&(1<<i))
-				break;				/* It has booted */
-			udelay(100);				/* Wait 5s total for a response */
-		}
-		if (cpu_callin_map[0]&(1<<i))
-		{
-			/* number CPUs logically, starting from 1 (BSP is 0) */
-#if 0
-			cpu_number_map[i] = cpucount;
-			__cpu_logical_map[cpucount] = i;
-#endif
-			printk("OK.\n");
-			printk("CPU%d: ", i);
-			print_cpu_info(&cpu_data[i]);
-		}
-		else
-		{
-			if (*((volatile unsigned char *)phys_to_virt(8192))==0xA5)
-				printk("Stuck ??\n");
-			else
-				printk("Not responding.\n");
-		}
-	SMP_PRINTK(("CPU has booted.\n"));
-	}
-	else
-	{
-		__cpu_logical_map[cpucount] = -1;
-		cpu_number_map[i] = -1;
-		cpucount--;
-	}
-
-	swapper_pg_dir[0]=maincfg;
-	local_flush_tlb();
-
-	/* mark "stuck" area as not stuck */
-	*((volatile unsigned long *)phys_to_virt(8192)) = 0;
-}
-
-cycles_t cacheflush_time;
-extern unsigned long cpu_hz;
-
-static void smp_tune_scheduling (void)
-{
-	unsigned long cachesize;
-	/*
-	 * Rough estimation for SMP scheduling, this is the number of
-	 * cycles it takes for a fully memory-limited process to flush
-	 * the SMP-local cache.
-	 *
-	 * (For a P5 this pretty much means we will choose another idle
-	 *  CPU almost always at wakeup time (this is due to the small
-	 *  L1 cache), on PIIs it's around 50-100 usecs, depending on
-	 *  the cache size)
-	 */
-
-	if (!cpu_hz) {
-		/*
-		 * this basically disables processor-affinity
-		 * scheduling on SMP without a TSC.
-		 */
-		cacheflush_time = 0;
-		return;
-	} else {
-		cachesize = boot_cpu_data.x86_cache_size;
-		if (cachesize == -1)
-			cachesize = 8; /* Pentiums */
+ *	This code is released under the GNU public license version 2 or
+ *	later.
+ */
 
-		cacheflush_time = cpu_hz/1024*cachesize/5000;
-	}
+#include <linux/config.h>
+#include <linux/init.h>
 
-	printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
-		(long)cacheflush_time/(cpu_hz/1000000),
-		((long)cacheflush_time*100/(cpu_hz/1000000)) % 100);
-}
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/smp_lock.h>
+#include <linux/irq.h>
 
-unsigned int prof_multiplier[NR_CPUS];
-unsigned int prof_old_multiplier[NR_CPUS];
-unsigned int prof_counter[NR_CPUS];
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h>
+#include <asm/mtrr.h>
 
 /*
- *	Cycle through the processors sending APIC IPIs to boot each.
+ *	Some notes on processor bugs:
+ *
+ *	Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ *	The Linux implications for SMP are handled as follows:
+ *
+ *	Pentium III / [Xeon]
+ *		None of the E1AP-E3AP errata are visible to the user.
+ *
+ *	E1AP.	see PII A1AP
+ *	E2AP.	see PII A2AP
+ *	E3AP.	see PII A3AP
+ *
+ *	Pentium II / [Xeon]
+ *		None of the A1AP-A3AP errata are visible to the user.
+ *
+ *	A1AP.	see PPro 1AP
+ *	A2AP.	see PPro 2AP
+ *	A3AP.	see PPro 7AP
+ *
+ *	Pentium Pro
+ *		None of the 1AP-9AP errata are visible to the normal user,
+ *	except occasional delivery of 'spurious interrupt' as trap #15.
+ *	This is very rare and a non-problem.
+ *
+ *	1AP.	Linux maps APIC as non-cacheable
+ *	2AP.	worked around in hardware
+ *	3AP.	fixed in C0 and above steppings microcode update.
+ *		Linux does not use excessive STARTUP_IPIs.
+ *	4AP.	worked around in hardware
+ *	5AP.	symmetric IO mode (normal Linux operation) not affected.
+ *		'noapic' mode has vector 0xf filled out properly.
+ *	6AP.	'noapic' mode might be affected - fixed in later steppings
+ *	7AP.	We do not assume writes to the LVT deasserting IRQs
+ *	8AP.	We do not enable low power mode (deep sleep) during MP bootup
+ *	9AP.	We do not use mixed mode
+ *
+ *	Pentium
+ *		There is a marginal case where REP MOVS on 100MHz SMP
+ *	machines with B stepping processors can fail. XXX should provide
+ *	an L1cache=Writethrough or L1cache=off option.
+ *
+ *		B stepping CPUs may hang. There are hardware work arounds
+ *	for this. We warn about it in case your board doesn't have the work
+ *	arounds. Basically that's so I can tell anyone with a B stepping
+ *	CPU and SMP problems "tough".
+ *
+ *	Specific items [From Pentium Processor Specification Update]
+ *
+ *	1AP.	Linux doesn't use remote read
+ *	2AP.	Linux doesn't trust APIC errors
+ *	3AP.	We work around this
+ *	4AP.	Linux never generated 3 interrupts of the same priority
+ *		to cause a lost local interrupt.
+ *	5AP.	Remote read is never used
+ *	6AP.	not affected - worked around in hardware
+ *	7AP.	not affected - worked around in hardware
+ *	8AP.	worked around in hardware - we get explicit CS errors if not
+ *	9AP.	only 'noapic' mode affected. Might generate spurious
+ *		interrupts, we log only the first one and count the
+ *		rest silently.
+ *	10AP.	not affected - worked around in hardware
+ *	11AP.	Linux reads the APIC between writes to avoid this, as per
+ *		the documentation. Make sure you preserve this as it affects
+ *		the C stepping chips too.
+ *	12AP.	not affected - worked around in hardware
+ *	13AP.	not affected - worked around in hardware
+ *	14AP.	we always deassert INIT during bootup
+ *	15AP.	not affected - worked around in hardware
+ *	16AP.	not affected - worked around in hardware
+ *	17AP.	not affected - worked around in hardware
+ *	18AP.	not affected - worked around in hardware
+ *	19AP.	not affected - worked around in BIOS
+ *
+ *	If this sounds worrying, believe me, these bugs are either ___RARE___,
+ *	or are signal timing bugs worked around in hardware and there's
+ *	about nothing of note with C stepping upwards.
  */
 
-void __init smp_boot_cpus(void)
-{
-	int i;
-
-#ifdef CONFIG_MTRR
-	/*  Must be done before other processors booted  */
-	mtrr_init_boot_cpu ();
-#endif
-	/*
-	 *	Initialize the logical to physical CPU number mapping
-	 *	and the per-CPU profiling counter/multiplier
-	 */
-
-	for (i = 0; i < NR_CPUS; i++) {
-		cpu_number_map[i] = -1;
-		prof_counter[i] = 1;
-		prof_old_multiplier[i] = 1;
-		prof_multiplier[i] = 1;
-	}
-
-	/*
-	 *	Setup boot CPU information
-	 */
-
-	smp_store_cpu_info(boot_cpu_id);			/* Final full version of the data */
-	smp_tune_scheduling();
-	printk("CPU%d: ", boot_cpu_id);
-	print_cpu_info(&cpu_data[boot_cpu_id]);
-
-	/*
-	 * not necessary because the MP table should list the boot
-	 * CPU too, but we do it for the sake of robustness anyway.
-	 * (and for the case when a non-SMP board boots an SMP kernel)
-	 */
-	cpu_present_map |= (1 << hard_smp_processor_id());
-
-	cpu_number_map[boot_cpu_id] = 0;
-
-	init_idle();
-
-	/*
-	 * If we couldnt find an SMP configuration at boot time,
-	 * get out of here now!
-	 */
-
-	if (!smp_found_config)
-	{
-		printk(KERN_NOTICE "SMP motherboard not detected. Using dummy APIC emulation.\n");
-#ifndef CONFIG_VISWS
-		io_apic_irqs = 0;
-#endif
-		cpu_online_map = cpu_present_map;
-		smp_num_cpus = 1;
-		goto smp_done;
-	}
-
-	/*
-	 *	If SMP should be disabled, then really disable it!
-	 */
-
-	if (!max_cpus)
-	{
-		smp_found_config = 0;
-		printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
-	}
-
-#ifdef SMP_DEBUG
-	{
-		int reg;
-
-		/*
-		 *	This is to verify that we're looking at
-		 *	a real local APIC.  Check these against
-		 *	your board if the CPUs aren't getting
-		 *	started for no apparent reason.
-		 */
-
-		reg = apic_read(APIC_VERSION);
-		SMP_PRINTK(("Getting VERSION: %x\n", reg));
-
-		apic_write(APIC_VERSION, 0);
-		reg = apic_read(APIC_VERSION);
-		SMP_PRINTK(("Getting VERSION: %x\n", reg));
-
-		/*
-		 *	The two version reads above should print the same
-		 *	NON-ZERO!!! numbers.  If the second one is zero,
-		 *	there is a problem with the APIC write/read
-		 *	definitions.
-		 *
-		 *	The next two are just to see if we have sane values.
-		 *	They're only really relevant if we're in Virtual Wire
-		 *	compatibility mode, but most boxes are anymore.
-		 */
-
-
-		reg = apic_read(APIC_LVT0);
-		SMP_PRINTK(("Getting LVT0: %x\n", reg));
-
-		reg = apic_read(APIC_LVT1);
-		SMP_PRINTK(("Getting LVT1: %x\n", reg));
-	}
-#endif
-
-	enable_local_APIC();
-
-	/*
-	 * Set up our local APIC timer:
-	 */
-	setup_APIC_clock ();
-
-	/*
-	 *	Now scan the CPU present map and fire up the other CPUs.
-	 */
-
-	/*
-	 * Add all detected CPUs. (later on we can down individual
-	 * CPUs which will change cpu_online_map but not necessarily
-	 * cpu_present_map. We are pretty much ready for hot-swap CPUs.)
-	 */
-	cpu_online_map = cpu_present_map;
-	mb();
-
-	SMP_PRINTK(("CPU map: %lx\n", cpu_present_map));
-
-	for(i=0;i<NR_CPUS;i++)
-	{
-		/*
-		 *	Don't even attempt to start the boot CPU!
-		 */
-		if (i == boot_cpu_id)
-			continue;
-
-		if ((cpu_online_map & (1 << i))
-		    && (max_cpus < 0 || max_cpus > cpucount+1))
-		{
-			do_boot_cpu(i);
-		}
-
-		/*
-		 *	Make sure we unmap all failed CPUs
-		 */
-		
-		if (cpu_number_map[i] == -1 && (cpu_online_map & (1 << i))) {
-			printk("CPU #%d not responding. Removing from cpu_online_map.\n",i);
-			cpu_online_map &= ~(1 << i);
-                }
-        }
-
-	/*
-	 *	Cleanup possible dangling ends...
-	 */
-
-#ifndef CONFIG_VISWS
-	{
-		unsigned long cfg;
-
-		/*
-		 *	Install writable page 0 entry.
-		 */
-		cfg = pg0[0];
-		pg0[0] = _PAGE_RW | _PAGE_PRESENT;	/* writeable, present, addr 0 */
-		local_flush_tlb();
-	
-		/*
-		 *	Paranoid:  Set warm reset code and vector here back
-		 *	to default values.
-		 */
-
-		CMOS_WRITE(0, 0xf);
-
-		*((volatile long *) phys_to_virt(0x467)) = 0;
-
-		/*
-		 *	Restore old page 0 entry.
-		 */
-
-		pg0[0] = cfg;
-		local_flush_tlb();
-	}
-#endif
-
-	/*
-	 *	Allow the user to impress friends.
-	 */
-
-	SMP_PRINTK(("Before bogomips.\n"));
-	if (!cpucount) {
-		printk(KERN_ERR "Error: only one processor found.\n");
-		cpu_online_map = (1<<hard_smp_processor_id());
-	} else {
-		unsigned long bogosum = 0;
-		for(i = 0; i < 32; i++)
-			if (cpu_online_map&(1<<i))
-				bogosum+=cpu_data[i].loops_per_sec;
-		printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
-			cpucount+1,
-			(bogosum+2500)/500000,
-			((bogosum+2500)/5000)%100);
-		SMP_PRINTK(("Before bogocount - setting activated=1.\n"));
-		smp_activated = 1;
-	}
-	smp_num_cpus = cpucount + 1;
-
-	if (smp_b_stepping)
-		printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
-	SMP_PRINTK(("Boot done.\n"));
-
-	cache_APIC_registers();
-#ifndef CONFIG_VISWS
-	/*
-	 * Here we can be sure that there is an IO-APIC in the system. Let's
-	 * go and set it up:
-	 */
-	if (!skip_ioapic_setup) 
-		setup_IO_APIC();
-#endif
-
-smp_done:
-	/*
-	 * now we know the other CPUs have fired off and we know our
-	 * APIC ID, so we can go init the TSS and stuff:
-	 */
-	cpu_init();
-}
+/* The 'big kernel lock' */
+spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;
 
+volatile unsigned long smp_invalidate_needed;
 
 /*
  * the following functions deal with sending IPIs between CPUs.
@@ -1429,17 +110,6 @@
  * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
  */
 
-
-/*
- * Silly serialization to work around CPU bug in P5s.
- * We can safely turn it off on a 686.
- */
-#ifdef CONFIG_X86_GOOD_APIC
-# define FORCE_APIC_SERIALIZATION 0
-#else
-# define FORCE_APIC_SERIALIZATION 1
-#endif
-
 static unsigned int cached_APIC_ICR;
 static unsigned int cached_APIC_ICR2;
 
@@ -1462,7 +132,7 @@
 
 static inline unsigned int __get_ICR (void)
 {
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
 	/*
 	 * Wait for the APIC to become ready - this should never occur. It's
 	 * a debugging check really.
@@ -1473,11 +143,11 @@
 	while (count < 1000)
 	{
 		cfg = slow_ICR;
-		if (!(cfg&(1<<12))) {
-			if (count)
-				atomic_add(count, (atomic_t*)&ipi_count);
+		if (!(cfg&(1<<12)))
 			return cfg;
-		}
+		printk("CPU #%d: ICR still busy [%08x]\n",
+					smp_processor_id(), cfg);
+		irq_err_count++;
 		count++;
 		udelay(10);
 	}
@@ -1491,19 +161,25 @@
 
 static inline unsigned int __get_ICR2 (void)
 {
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
 	return slow_ICR2;
 #else
 	return cached_APIC_ICR2;
 #endif
 }
 
+#define LOGICAL_DELIVERY 1
+
 static inline int __prepare_ICR (unsigned int shortcut, int vector)
 {
 	unsigned int cfg;
 
 	cfg = __get_ICR();
-	cfg |= APIC_DEST_DM_FIXED|shortcut|vector;
+	cfg |= APIC_DEST_DM_FIXED|shortcut|vector
+#if LOGICAL_DELIVERY
+		|APIC_DEST_LOGICAL
+#endif
+		;
 
 	return cfg;
 }
@@ -1513,7 +189,11 @@
 	unsigned int cfg;
 
 	cfg = __get_ICR2();
+#if LOGICAL_DELIVERY
+	cfg |= SET_APIC_DEST_FIELD((1<<dest));
+#else
 	cfg |= SET_APIC_DEST_FIELD(dest);
+#endif
 
 	return cfg;
 }
@@ -1526,7 +206,7 @@
  * have to lock out interrupts to be safe. Otherwise it's just one
  * single atomic write to the APIC, no need for cli/sti.
  */
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
 	unsigned long flags;
 
 	__save_flags(flags);
@@ -1536,21 +216,26 @@
 	/*
 	 * No need to touch the target chip field
 	 */
-
 	cfg = __prepare_ICR(shortcut, vector);
 
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
 	apic_write(APIC_ICR, cfg);
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
 	__restore_flags(flags);
 #endif
 }
 
 static inline void send_IPI_allbutself(int vector)
 {
-	__send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
+	/*
+	 * if there are no other CPUs in the system then
+	 * we get an APIC send error if we try to broadcast.
+	 * thus we have to avoid sending IPIs in this case.
+	 */
+	if (smp_num_cpus > 1)
+		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
 }
 
 static inline void send_IPI_all(int vector)
@@ -1566,7 +251,7 @@
 static inline void send_IPI_single(int dest, int vector)
 {
 	unsigned long cfg;
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
 	unsigned long flags;
 
 	__save_flags(flags);
@@ -1589,7 +274,7 @@
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
 	apic_write(APIC_ICR, cfg);
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
 	__restore_flags(flags);
 #endif
 }
@@ -1715,200 +400,97 @@
 }
 
 /*
- * this function sends a 'stop' IPI to all other CPUs in the system.
- * it goes straight through.
- */
-
-void smp_send_stop(void)
-{
-	send_IPI_allbutself(STOP_CPU_VECTOR);
-}
-
-/* Structure and data for smp_call_function(). This is designed to minimise
+ * Structure and data for smp_call_function(). This is designed to minimise
  * static memory requirements. It also looks cleaner.
  */
-struct smp_call_function_struct {
+static volatile struct call_data_struct {
 	void (*func) (void *info);
 	void *info;
-	atomic_t unstarted_count;
-	atomic_t unfinished_count;
+	atomic_t started;
+	atomic_t finished;
 	int wait;
-};
-static volatile struct smp_call_function_struct *smp_call_function_data = NULL;
+} *call_data = NULL;
 
 /*
  * this function sends a 'generic call function' IPI to all other CPUs
  * in the system.
  */
 
-int smp_call_function (void (*func) (void *info), void *info, int retry,
-		       int wait)
-/*  [SUMMARY] Run a function on all other CPUs.
-    <func> The function to run. This must be fast and non-blocking.
-    <info> An arbitrary pointer to pass to the function.
-    <retry> If true, keep retrying until ready.
-    <wait> If true, wait until function has completed on other CPUs.
-    [RETURNS] 0 on success, else a negative status code. Does not return until
-    remote CPUs are nearly ready to execute <<func>> or are or have executed.
-*/
-{
-	unsigned long timeout;
-	struct smp_call_function_struct data;
-	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
-
-	if (retry) {
-		while (1) {
-			if (smp_call_function_data) {
-				schedule ();  /*  Give a mate a go  */
-				continue;
-			}
-			spin_lock (&lock);
-			if (smp_call_function_data) {
-				spin_unlock (&lock);  /*  Bad luck  */
-				continue;
-			}
-			/*  Mine, all mine!  */
-			break;
-		}
-	}
-	else {
-		if (smp_call_function_data) return -EBUSY;
-		spin_lock (&lock);
-		if (smp_call_function_data) {
-			spin_unlock (&lock);
-			return -EBUSY;
-		}
-	}
-	smp_call_function_data = &data;
-	spin_unlock (&lock);
-	data.func = func;
-	data.info = info;
-	atomic_set (&data.unstarted_count, smp_num_cpus - 1);
-	data.wait = wait;
-	if (wait) atomic_set (&data.unfinished_count, smp_num_cpus - 1);
-	/*  Send a message to all other CPUs and wait for them to respond  */
-	send_IPI_allbutself (CALL_FUNCTION_VECTOR);
-	/*  Wait for response  */
-	timeout = jiffies + JIFFIE_TIMEOUT;
-	while ( (atomic_read (&data.unstarted_count) > 0) &&
-		time_before (jiffies, timeout) )
-		barrier ();
-	if (atomic_read (&data.unstarted_count) > 0) {
-		smp_call_function_data = NULL;
-		return -ETIMEDOUT;
-	}
-	if (wait)
-		while (atomic_read (&data.unfinished_count) > 0)
-			barrier ();
-	smp_call_function_data = NULL;
-	return 0;
-}
-
-static unsigned int calibration_result;
-
-void setup_APIC_timer(unsigned int clocks);
-
+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+			int wait)
 /*
- * Local timer interrupt handler. It does both profiling and
- * process statistics/rescheduling.
- *
- * We do profiling in every local tick, statistics/rescheduling
- * happen only every 'profiling multiplier' ticks. The default
- * multiplier is 1 and it can be changed by writing the new multiplier
- * value into /proc/profile.
+ * [SUMMARY] Run a function on all other CPUs.
+ * <func> The function to run. This must be fast and non-blocking.
+ * <info> An arbitrary pointer to pass to the function.
+ * <nonatomic> If true, we might schedule away to lock the mutex
+ * <wait> If true, wait (atomically) until function has completed on other CPUs.
+ * [RETURNS] 0 on success, else a negative status code. Does not return until
+ * remote CPUs are nearly ready to execute <<func>> or are or have executed.
  */
-
-void smp_local_timer_interrupt(struct pt_regs * regs)
 {
-	int user = (user_mode(regs) != 0);
-	int cpu = smp_processor_id();
-
-	/*
-	 * The profiling function is SMP safe. (nothing can mess
-	 * around with "current", and the profiling counters are
-	 * updated with atomic operations). This is especially
-	 * useful with a profiling multiplier != 1
-	 */
-	if (!user)
-		x86_do_profile(regs->eip);
+	struct call_data_struct data;
+	int ret, cpus = smp_num_cpus-1;
+	static DECLARE_MUTEX(lock);
+	unsigned long timeout;
 
-	if (!--prof_counter[cpu]) {
-		int system = 1 - user;
-		struct task_struct * p = current;
+	if (nonatomic)
+		down(&lock);
+	else
+		if (down_trylock(&lock))
+			return -EBUSY;
 
-		/*
-		 * The multiplier may have changed since the last time we got
-		 * to this point as a result of the user writing to
-		 * /proc/profile.  In this case we need to adjust the APIC
-		 * timer accordingly.
-		 *
-		 * Interrupts are already masked off at this point.
-		 */
-               prof_counter[cpu] = prof_multiplier[cpu];
-               if (prof_counter[cpu] != prof_old_multiplier[cpu]) {
-                       setup_APIC_timer(calibration_result/prof_counter[cpu]);
-                       prof_old_multiplier[cpu] = prof_counter[cpu];
-               }
+	if (call_data) // temporary debugging check
+		BUG();
 
-		/*
-		 * After doing the above, we need to make like
-		 * a normal interrupt - otherwise timer interrupts
-		 * ignore the global interrupt lock, which is the
-		 * WrongThing (tm) to do.
-		 */
+	call_data = &data;
+	data.func = func;
+	data.info = info;
+	atomic_set(&data.started, 0);
+	data.wait = wait;
+	if (wait)
+		atomic_set(&data.finished, 0);
+	mb();
 
- 		irq_enter(cpu, 0);
-		update_one_process(p, 1, user, system, cpu);
-		if (p->pid) {
-			p->counter -= 1;
-			if (p->counter <= 0) {
-				p->counter = 0;
-				p->need_resched = 1;
-			}
-			if (p->priority < DEF_PRIORITY) {
-				kstat.cpu_nice += user;
-				kstat.per_cpu_nice[cpu] += user;
-			} else {
-				kstat.cpu_user += user;
-				kstat.per_cpu_user[cpu] += user;
-			}
-			kstat.cpu_system += system;
-			kstat.per_cpu_system[cpu] += system;
+	/* Send a message to all other CPUs and wait for them to respond */
+	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
 
-		}
-		irq_exit(cpu, 0);
-	}
+	/* Wait for response */
+	timeout = jiffies + HZ;
+	while ((atomic_read(&data.started) != cpus)
+			&& time_before(jiffies, timeout))
+		barrier();
+	ret = -ETIMEDOUT;
+	if (atomic_read(&data.started) != cpus)
+		goto out;
+	ret = 0;
+	if (wait)
+		while (atomic_read(&data.finished) != cpus)
+			barrier();
+out:
+	call_data = NULL;
+	up(&lock);
+	return 0;
+}
 
+static void stop_this_cpu (void * dummy)
+{
 	/*
-	 * We take the 'long' return path, and there every subsystem
-	 * grabs the apropriate locks (kernel lock/ irq lock).
-	 *
-	 * we might want to decouple profiling from the 'long path',
-	 * and do the profiling totally in assembly.
-	 *
-	 * Currently this isn't too much of an issue (performance wise),
-	 * we can take more than 100K local irqs per second on a 100 MHz P5.
+	 * Remove this CPU:
 	 */
+	clear_bit(smp_processor_id(), &cpu_online_map);
+
+	if (cpu_data[smp_processor_id()].hlt_works_ok)
+		for(;;) __asm__("hlt");
+	for (;;);
 }
 
 /*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesnt support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- *   interrupt as well. Thus we cannot inline the local irq ... ]
+ * this function calls the 'stop' function on all other CPUs in the system.
  */
-void smp_apic_timer_interrupt(struct pt_regs * regs)
+
+void smp_send_stop(void)
 {
-	/*
-	 * NOTE! We'd better ACK the irq immediately,
-	 * because timer handling can be slow, and we
-	 * want to be able to accept NMI tlb invalidates
-	 * during this time.
-	 */
-	ack_APIC_irq();
-	smp_local_timer_interrupt(regs);
+        smp_call_function(stop_this_cpu, NULL, 1, 0);
 }
 
 /*
@@ -1944,39 +526,24 @@
 
 }
 
-static void stop_this_cpu (void)
+asmlinkage void smp_call_function_interrupt(void)
 {
+	void (*func) (void *info) = call_data->func;
+	void *info = call_data->info;
+	int wait = call_data->wait;
+
+	ack_APIC_irq();
 	/*
-	 * Remove this CPU:
+	 * Notify initiating CPU that I've grabbed the data and am
+	 * about to execute the function
 	 */
-	clear_bit(smp_processor_id(), &cpu_online_map);
-
-	if (cpu_data[smp_processor_id()].hlt_works_ok)
-		for(;;) __asm__("hlt");
-	for (;;);
-}
-
-/*
- *	CPU halt call-back
- */
-asmlinkage void smp_stop_cpu_interrupt(void)
-{
-	stop_this_cpu();
-}
-
-asmlinkage void smp_call_function_interrupt(void)
-{
-	void (*func) (void *info) = smp_call_function_data->func;
-	void *info = smp_call_function_data->info;
-	int wait = smp_call_function_data->wait;
-
-	ack_APIC_irq ();
-	/*  Notify initiating CPU that I've grabbed the data and am about to
-	    execute the function  */
-	atomic_dec (&smp_call_function_data->unstarted_count);
-	/*  At this point the structure may be out of scope unless wait==1  */
-	(*func) (info);
-	if (wait) atomic_dec (&smp_call_function_data->unfinished_count);
+	atomic_inc(&call_data->started);
+	/*
+	 * At this point the structure may be out of scope unless wait==1
+	 */
+	(*func)(info);
+	if (wait)
+		atomic_inc(&call_data->finished);
 }
 
 /*
@@ -1991,6 +558,34 @@
 }
 
 /*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+
+static spinlock_t err_lock;
+
+asmlinkage void smp_error_interrupt(void)
+{
+	unsigned long v;
+
+	spin_lock(&err_lock);
+
+	v = apic_read(APIC_ESR);
+	printk("APIC error interrupt on CPU#%d, should never happen.\n",
+			smp_processor_id());
+	printk("... APIC ESR0: %08lx\n", v);
+
+	apic_write(APIC_ESR, 0);
+	v = apic_read(APIC_ESR);
+	printk("... APIC ESR1: %08lx\n", v);
+
+	ack_APIC_irq();
+
+	irq_err_count++;
+
+	spin_unlock(&err_lock);
+}
+
+/*
  * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts
  * per second. We assume that the caller has already set up the local
  * APIC.
@@ -1999,6 +594,10 @@
  * closely follows bus clocks.
  */
 
+int prof_multiplier[NR_CPUS] = { 1, };
+int prof_old_multiplier[NR_CPUS] = { 1, };
+int prof_counter[NR_CPUS] = { 1, };
+
 /*
  * The timer chip is already set up at HZ interrupts per second here,
  * but we do not accept timer interrupts yet. We only allow the BP
@@ -2015,66 +614,102 @@
 	return count;
 }
 
+void __init wait_8254_wraparound(void)
+{
+	unsigned int curr_count, prev_count=~0;
+	int delta;
+
+	curr_count = get_8254_timer_count();
+
+	do {
+		prev_count = curr_count;
+		curr_count = get_8254_timer_count();
+		delta = curr_count-prev_count;
+
+	/*
+	 * This limit for delta seems arbitrary, but it isn't, it's
+	 * slightly above the level of error a buggy Mercury/Neptune
+	 * chipset timer can cause.
+	 */
+
+	} while (delta<300);
+}
+
 /*
  * This function sets up the local APIC timer, with a timeout of
  * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice, once with a bogus timeout value, second
- * time for real. The other (noncalibrating) CPUs call this
- * function only once, with the real value.
- *
- * We are strictly in irqs off mode here, as we do not want to
- * get an APIC interrupt go off accidentally.
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
  *
  * We do reads before writes even if unnecessary, to get around the
- * APIC double write bug.
+ * P5 APIC double write bug.
  */
 
 #define APIC_DIVISOR 16
 
-void setup_APIC_timer(unsigned int clocks)
+void __setup_APIC_LVTT(unsigned int clocks)
 {
-	unsigned long lvtt1_value;
-	unsigned int tmp_value;
+	unsigned int lvtt1_value, tmp_value;
 
-	/*
-	 * Unfortunately the local APIC timer cannot be set up into NMI
-	 * mode. With the IO APIC we can re-route the external timer
-	 * interrupt and broadcast it as an NMI to all CPUs, so no pain.
-	 */
 	tmp_value = apic_read(APIC_LVTT);
-	lvtt1_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
-	apic_write(APIC_LVTT , lvtt1_value);
+	lvtt1_value = SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV) |
+			APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
+	apic_write(APIC_LVTT, lvtt1_value);
 
 	/*
 	 * Divide PICLK by 16
 	 */
 	tmp_value = apic_read(APIC_TDCR);
-	apic_write(APIC_TDCR , (tmp_value & ~APIC_TDR_DIV_1 )
-				 | APIC_TDR_DIV_16);
+	apic_write(APIC_TDCR, (tmp_value
+				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+				| APIC_TDR_DIV_16);
 
 	tmp_value = apic_read(APIC_TMICT);
 	apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
 }
 
-void __init wait_8254_wraparound(void)
+void setup_APIC_timer(void * data)
 {
-	unsigned int curr_count, prev_count=~0;
+	unsigned int clocks = (unsigned int) data, slice, t0, t1, nr;
+	unsigned long flags;
 	int delta;
 
-	curr_count = get_8254_timer_count();
-
-	do {
-		prev_count = curr_count;
-		curr_count = get_8254_timer_count();
-		delta = curr_count-prev_count;
+	__save_flags(flags);
+	__sti();
+	/*
+	 * ok, Intel has some smart code in their APIC that knows
+	 * if a CPU was in 'hlt' lowpower mode, and this increases
+	 * its APIC arbitration priority. To avoid the external timer
+ * IRQ APIC event being in sync with the APIC clock we
+	 * introduce an interrupt skew to spread out timer events.
+	 *
+	 * The number of slices within a 'big' timeslice is smp_num_cpus+1
+	 */
 
+	slice = clocks / (smp_num_cpus+1);
+	nr = cpu_number_map[smp_processor_id()] + 1;
+	printk("cpu: %d, clocks: %d, slice: %d, nr: %d.\n",
+		smp_processor_id(), clocks, slice, nr);
 	/*
-	 * This limit for delta seems arbitrary, but it isn't, it's
-	 * slightly above the level of error a buggy Mercury/Neptune
-	 * chipset timer can cause.
+	 * Wait for IRQ0's slice:
 	 */
+	wait_8254_wraparound();
 
-	} while (delta<300);
+	__setup_APIC_LVTT(clocks);
+
+	t0 = apic_read(APIC_TMCCT)*APIC_DIVISOR;
+	do {
+		t1 = apic_read(APIC_TMCCT)*APIC_DIVISOR;
+		delta = (int)(t0 - t1 - slice*nr);
+	} while (delta < 0);
+
+	__setup_APIC_LVTT(clocks);
+
+	printk("CPU%d<C0:%d,C:%d,D:%d,S:%d,C:%d>\n",
+			smp_processor_id(), t0, t1, delta, slice, clocks);
+
+	__restore_flags(flags);
 }
 
 /*
@@ -2092,10 +727,11 @@
 
 int __init calibrate_APIC_clock(void)
 {
-	unsigned long long t1,t2;
-	long tt1,tt2;
-	long calibration_result;
+	unsigned long long t1 = 0, t2 = 0;
+	long tt1, tt2;
+	long result;
 	int i;
+	const int LOOPS = HZ/10;
 
 	printk("calibrating APIC timer ... ");
 
@@ -2104,7 +740,7 @@
 	 * value into the APIC clock, we just want to get the
 	 * counter running for calibration.
 	 */
-	setup_APIC_timer(1000000000);
+	__setup_APIC_LVTT(1000000000);
 
 	/*
 	 * The timer chip counts down to zero. Let's wait
@@ -2112,23 +748,24 @@
 	 * (the current tick might have been already half done)
 	 */
 
-	wait_8254_wraparound ();
+	wait_8254_wraparound();
 
 	/*
 	 * We wrapped around just now. Let's start:
 	 */
-	rdtscll(t1);
-	tt1=apic_read(APIC_TMCCT);
+	if (cpu_has_tsc)
+		rdtscll(t1);
+	tt1 = apic_read(APIC_TMCCT);
 
-#define LOOPS (HZ/10)
 	/*
 	 * Let's wait LOOPS wraprounds:
 	 */
-	for (i=0; i<LOOPS; i++)
-		wait_8254_wraparound ();
+	for (i = 0; i < LOOPS; i++)
+		wait_8254_wraparound();
 
-	tt2=apic_read(APIC_TMCCT);
-	rdtscll(t2);
+	tt2 = apic_read(APIC_TMCCT);
+	if (cpu_has_tsc)
+		rdtscll(t2);
 
 	/*
 	 * The APIC bus clock counter is 32 bits only, it
@@ -2138,71 +775,37 @@
 	 * underflown to be exact, as the timer counts down ;)
 	 */
 
-	calibration_result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
-
-	SMP_PRINTK(("\n..... %ld CPU clocks in 1 timer chip tick.",
-			 (unsigned long)(t2-t1)/LOOPS));
+	result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
 
-	SMP_PRINTK(("\n..... %ld APIC bus clocks in 1 timer chip tick.",
-			 calibration_result));
+	if (cpu_has_tsc)
+		printk("\n..... CPU clock speed is %ld.%04ld MHz.\n",
+			((long)(t2-t1)/LOOPS)/(1000000/HZ),
+			((long)(t2-t1)/LOOPS)%(1000000/HZ));
+
+	printk("..... host bus clock speed is %ld.%04ld MHz.\n",
+		result/(1000000/HZ),
+		result%(1000000/HZ));
 
-
-	printk("\n..... CPU clock speed is %ld.%04ld MHz.\n",
-		((long)(t2-t1)/LOOPS)/(1000000/HZ),
-		((long)(t2-t1)/LOOPS)%(1000000/HZ)  );
-
-	printk("..... system bus clock speed is %ld.%04ld MHz.\n",
-		calibration_result/(1000000/HZ),
-		calibration_result%(1000000/HZ)  );
-#undef LOOPS
-
-	return calibration_result;
+	return result;
 }
 
-void __init setup_APIC_clock(void)
+static unsigned int calibration_result;
+
+void __init setup_APIC_clocks(void)
 {
 	unsigned long flags;
 
-	static volatile int calibration_lock;
-
 	__save_flags(flags);
 	__cli();
 
-	SMP_PRINTK(("setup_APIC_clock() called.\n"));
-
-	/*
-	 * [ setup_APIC_clock() is called from all CPUs, but we want
-	 *   to do this part of the setup only once ... and it fits
-	 *   here best ]
-	 */
-	if (!test_and_set_bit(0,&calibration_lock)) {
-
-		calibration_result=calibrate_APIC_clock();
-		/*
-	 	 * Signal completion to the other CPU[s]:
-	 	 */
-		calibration_lock = 3;
-
-	} else {
-		/*
-		 * Other CPU is calibrating, wait for finish:
-		 */
-		SMP_PRINTK(("waiting for other CPU calibrating APIC ... "));
-		while (calibration_lock == 1);
-		SMP_PRINTK(("done, continuing.\n"));
-	}
-
-/*
- * Now set up the timer for real.
- */
+	calibration_result = calibrate_APIC_clock();
 
-	setup_APIC_timer (calibration_result);
+	smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1);
 
 	/*
-	 * We ACK the APIC, just in case there is something pending.
+	 * Now set up the timer for real.
 	 */
-
-	ack_APIC_irq ();
+	setup_APIC_timer((void *)calibration_result);
 
 	__restore_flags(flags);
 }
@@ -2224,9 +827,9 @@
 		return -EINVAL;
 
 	/* 
-	 * Set the new multiplier for each CPU.  CPUs don't start using the
+	 * Set the new multiplier for each CPU. CPUs don't start using the
 	 * new values until the next timer interrupt in which they do process
-	 * accounting.  At that time they also adjust their APIC timers
+	 * accounting. At that time they also adjust their APIC timers
 	 * accordingly.
 	 */
 	for (i = 0; i < NR_CPUS; ++i)
@@ -2236,4 +839,112 @@
 }
 
 #undef APIC_DIVISOR
+
+/*
+ * Local timer interrupt handler. It does both profiling and
+ * process statistics/rescheduling.
+ *
+ * We do profiling in every local tick, statistics/rescheduling
+ * happen only every 'profiling multiplier' ticks. The default
+ * multiplier is 1 and it can be changed by writing the new multiplier
+ * value into /proc/profile.
+ */
+
+inline void smp_local_timer_interrupt(struct pt_regs * regs)
+{
+	int user = (user_mode(regs) != 0);
+	int cpu = smp_processor_id();
+
+	/*
+	 * The profiling function is SMP safe. (nothing can mess
+	 * around with "current", and the profiling counters are
+	 * updated with atomic operations). This is especially
+	 * useful with a profiling multiplier != 1
+	 */
+	if (!user)
+		x86_do_profile(regs->eip);
+
+	if (--prof_counter[cpu] <= 0) {
+		int system = 1 - user;
+		struct task_struct * p = current;
+
+		/*
+		 * The multiplier may have changed since the last time we got
+		 * to this point as a result of the user writing to
+		 * /proc/profile. In this case we need to adjust the APIC
+		 * timer accordingly.
+		 *
+		 * Interrupts are already masked off at this point.
+		 */
+		prof_counter[cpu] = prof_multiplier[cpu];
+		if (prof_counter[cpu] != prof_old_multiplier[cpu]) {
+			__setup_APIC_LVTT(calibration_result/prof_counter[cpu]);
+			prof_old_multiplier[cpu] = prof_counter[cpu];
+		}
+
+		/*
+		 * After doing the above, we need to make like
+		 * a normal interrupt - otherwise timer interrupts
+		 * ignore the global interrupt lock, which is the
+		 * WrongThing (tm) to do.
+		 */
+
+ 		irq_enter(cpu, 0);
+		update_one_process(p, 1, user, system, cpu);
+		if (p->pid) {
+			p->counter -= 1;
+			if (p->counter <= 0) {
+				p->counter = 0;
+				p->need_resched = 1;
+			}
+			if (p->priority < DEF_PRIORITY) {
+				kstat.cpu_nice += user;
+				kstat.per_cpu_nice[cpu] += user;
+			} else {
+				kstat.cpu_user += user;
+				kstat.per_cpu_user[cpu] += user;
+			}
+			kstat.cpu_system += system;
+			kstat.per_cpu_system[cpu] += system;
+
+		}
+		irq_exit(cpu, 0);
+	}
+
+	/*
+	 * We take the 'long' return path, and there every subsystem
+	 * grabs the appropriate locks (kernel lock/ irq lock).
+	 *
+	 * we might want to decouple profiling from the 'long path',
+	 * and do the profiling totally in assembly.
+	 *
+	 * Currently this isn't too much of an issue (performance wise),
+	 * we can take more than 100K local irqs per second on a 100 MHz P5.
+	 */
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ *   interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+unsigned int apic_timer_irqs [NR_CPUS] = { 0, };
+
+void smp_apic_timer_interrupt(struct pt_regs * regs)
+{
+	/*
+	 * the NMI deadlock-detector uses this.
+	 */
+	apic_timer_irqs[smp_processor_id()]++;
+
+	/*
+	 * NOTE! We'd better ACK the irq immediately,
+	 * because timer handling can be slow.
+	 */
+	ack_APIC_irq();
+	smp_local_timer_interrupt(regs);
+}
 

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)