diff -Naur -X dontdiff linux-2.5.69-ref/arch/i386/Kconfig linux-2.5.69/arch/i386/Kconfig --- linux-2.5.69-ref/arch/i386/Kconfig Sun May 4 18:53:02 2003 +++ linux-2.5.69/arch/i386/Kconfig Mon Jun 23 10:26:00 2003 @@ -687,6 +687,14 @@ depends on NUMA default y +config X86_MEM_HOTADD + bool "Hot-add memory support" + depends on HIGHMEM && !DISCONTIGMEM + default n + help + Select this if you have a system with memory hot-plug + capability. + config HIGHPTE bool "Allocate 3rd-level pagetables from highmem" depends on HIGHMEM4G || HIGHMEM64G diff -Naur -X dontdiff linux-2.5.69-ref/arch/i386/boot/setup.S linux-2.5.69/arch/i386/boot/setup.S --- linux-2.5.69-ref/arch/i386/boot/setup.S Sun May 4 18:53:31 2003 +++ linux-2.5.69/arch/i386/boot/setup.S Mon Jun 23 10:26:00 2003 @@ -162,9 +162,13 @@ # can be located anywhere in # low memory 0x10000 or higher. +#ifndef CONFIG_X86_MEM_HOTADD ramdisk_max: .long MAXMEM-1 # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd +#else +ramdisk_max: .long __MAX_RDMEM-1 +#endif trampoline: call start_of_setup .space 1024 diff -Naur -X dontdiff linux-2.5.69-ref/arch/i386/kernel/setup.c linux-2.5.69/arch/i386/kernel/setup.c --- linux-2.5.69-ref/arch/i386/kernel/setup.c Sun May 4 18:53:14 2003 +++ linux-2.5.69/arch/i386/kernel/setup.c Mon Jun 23 10:26:00 2003 @@ -74,6 +74,12 @@ /* user-defined highmem size */ static unsigned int highmem_pages = -1; +#ifdef CONFIG_X86_MEM_HOTADD +/* Virtual address space reserved for memory hot-add operation */ +unsigned long hotadd_reserve_size; +unsigned long hotadd_reserve_start; +#endif /* CONFIG_X86_MEM_HOTADD */ + /* * Setup options */ @@ -431,6 +437,7 @@ print_memory_map(who); } /* setup_memory_region */ +static int mem_hotadd_disabled = 0; static void __init parse_cmdline_early (char ** cmdline_p) { @@ -482,6 +489,11 @@ from += 8+7; e820.nr_map = 0; userdef = 1; +#ifdef CONFIG_X86_MEM_HOTADD + } else if (!memcmp(from+4, "nohotadd", 8)) { + from += 8+4; 
mem_hotadd_disabled = 1; +#endif /* CONFIG_X86_MEM_HOTADD */ } else { /* If the user specifies memory size, we * limit the BIOS-provided memory map to @@ -512,6 +524,7 @@ if (c == ' ' && !memcmp(from, "acpi=off", 8)) acpi_disabled = 1; +#ifndef CONFIG_X86_MEM_HOTADD /* * highmem=size forces highmem to be exactly 'size' bytes. * This works even on boxes that have no highmem otherwise. @@ -519,6 +532,7 @@ */ if (c == ' ' && !memcmp(from, "highmem=", 8)) highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; +#endif /* !CONFIG_X86_MEM_HOTADD */ c = *(from++); if (!c) @@ -617,6 +631,54 @@ } #ifndef CONFIG_DISCONTIGMEM + +#ifdef CONFIG_X86_MEM_HOTADD + +/* + * Reserve enough virtual addresses to store page structures for + * hot-added memory. + * Needs work: Should the reserve_size depend on a configuration + * option? + */ +static void __init find_hotadd_reserve_size(unsigned long max_pfn) +{ + if (! mem_hotadd_disabled) { + +#ifndef CONFIG_X86_PAE + if (max_pfn > MAX_NONPAE_PFN) + hotadd_reserve_size = 0; + else + hotadd_reserve_size = (MAX_NONPAE_PFN - max_pfn) * + sizeof(struct page); +#else /* CONFIG_X86_PAE */ + if (max_pfn >= MAX_PAE_PFN) + hotadd_reserve_size = 0; + else + hotadd_reserve_size = (MAX_PAE_PFN - max_pfn) * + sizeof(struct page); + +#endif /* CONFIG_X86_PAE */ + + if (hotadd_reserve_size % PMD_SIZE) + hotadd_reserve_size += (PMD_SIZE - + (hotadd_reserve_size % PMD_SIZE)); + } +} + +static unsigned long +__init set_hotadd_reserve_start(unsigned long max_low_pfn) +{ + if (hotadd_reserve_size) { + hotadd_reserve_start = (unsigned long)__va(max_low_pfn * PAGE_SIZE); + return (max_low_pfn + (hotadd_reserve_size >> PAGE_SHIFT)); + } + else { + hotadd_reserve_start = 0; + return 0; + } +} +#endif /* CONFIG_X86_MEM_HOTADD */ + /* * Register fully available low RAM pages with the bootmem allocator. 
*/ @@ -657,10 +719,20 @@ } } +#if defined (CONFIG_BLK_DEV_INITRD) && defined(CONFIG_X86_MEM_HOTADD) +static unsigned long __initdata initrd_high_start; +static unsigned long __initdata initrd_high_end; +static int initrd_high = 0; +#endif /* CONFIG_BLK_DEV_INITRD && CONFIG_X86_MEM_HOTADD */ + static unsigned long __init setup_memory(void) { unsigned long bootmap_size, start_pfn, max_low_pfn; +#ifdef CONFIG_X86_MEM_HOTADD + unsigned long saved_low_pfn = 0; +#endif /* CONFIG_X86_MEM_HOTADD */ + /* * partially used pages are not usable - thus * we are rounding upwards: @@ -669,6 +741,10 @@ find_max_pfn(); +#ifdef CONFIG_X86_MEM_HOTADD + find_hotadd_reserve_size(max_pfn); +#endif + max_low_pfn = find_max_low_pfn(); #ifdef CONFIG_HIGHMEM @@ -681,6 +757,11 @@ #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); + +#ifdef CONFIG_X86_MEM_HOTADD + saved_low_pfn = set_hotadd_reserve_start(max_low_pfn); +#endif + /* * Initialize the boot-time allocator (with low memory only): */ @@ -732,6 +813,35 @@ INITRD_START ? INITRD_START + PAGE_OFFSET : 0; initrd_end = initrd_start+INITRD_SIZE; } +#ifdef CONFIG_X86_MEM_HOTADD + else if (saved_low_pfn && (INITRD_START + INITRD_SIZE <= + (saved_low_pfn << PAGE_SHIFT))) { + unsigned long initrd_low_start; + unsigned long initrd_low_end; + unsigned long initrd_low_size; + + initrd_high = 1; + initrd_low_size = initrd_low_start = 0; + initrd_high_start = 0; + if ((max_low_pfn << PAGE_SHIFT) > INITRD_START) { + + initrd_low_start = INITRD_START + PAGE_OFFSET; + initrd_low_size = (max_low_pfn << PAGE_SHIFT) - + initrd_start; + initrd_low_end = initrd_low_start + + initrd_low_size; + reserve_bootmem(INITRD_START, initrd_low_size); + } + initrd_high_end = INITRD_START + INITRD_SIZE + + PAGE_OFFSET; + initrd_high_start = initrd_high_end - + (INITRD_SIZE - initrd_low_size); + initrd_start = (initrd_low_start) ? 
initrd_low_start : + initrd_high_start; + initrd_end = initrd_high_end; + + } +#endif else { printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", @@ -959,6 +1069,12 @@ smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ #endif paging_init(); + +#if defined (CONFIG_BLK_DEV_INITRD) && defined(CONFIG_X86_MEM_HOTADD) + if (initrd_high && initrd_high_start) + initrd_map_high (initrd_high_start, initrd_high_end); +#endif + #ifdef CONFIG_ACPI_BOOT /* * Parse the ACPI tables for possible boot-time SMP configuration. diff -Naur -X dontdiff linux-2.5.69-ref/arch/i386/mm/Makefile linux-2.5.69/arch/i386/mm/Makefile --- linux-2.5.69-ref/arch/i386/mm/Makefile Sun May 4 18:53:08 2003 +++ linux-2.5.69/arch/i386/mm/Makefile Mon Jun 23 10:26:00 2003 @@ -8,3 +8,4 @@ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o +obj-$(CONFIG_X86_MEM_HOTADD) += mem_hotadd.o diff -Naur -X dontdiff linux-2.5.69-ref/arch/i386/mm/fault.c linux-2.5.69/arch/i386/mm/fault.c --- linux-2.5.69-ref/arch/i386/mm/fault.c Sun May 4 18:52:48 2003 +++ linux-2.5.69/arch/i386/mm/fault.c Mon Jun 23 10:26:00 2003 @@ -413,6 +413,10 @@ goto no_context; set_pmd(pmd, *pmd_k); + /* If PMD points to a 4MB page, no PTEs to update */ + if (pmd_has_pse(*pmd_k)) + return; + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; diff -Naur -X dontdiff linux-2.5.69-ref/arch/i386/mm/init.c linux-2.5.69/arch/i386/mm/init.c --- linux-2.5.69-ref/arch/i386/mm/init.c Sun May 4 18:53:36 2003 +++ linux-2.5.69/arch/i386/mm/init.c Mon Jun 23 10:26:00 2003 @@ -562,15 +562,94 @@ } #ifdef CONFIG_BLK_DEV_INITRD + +#ifdef CONFIG_X86_MEM_HOTADD +void initrd_map_high(unsigned long start, unsigned long end) +{ + unsigned long align_start = start & PMD_MASK; + unsigned long align_end = (end + PMD_SIZE - 1) & PMD_MASK; + unsigned long base; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte, 
*pte_base; + int count = (align_end - align_start) / PMD_SIZE; + int i; + + pgd = pgd_offset_k(align_start); + pmd = pmd_offset(pgd, align_start); + base = align_start; + if (cpu_has_pse) { + while (count--) { + set_pmd(pmd,__pmd(_KERNPG_TABLE + _PAGE_PSE + __pa(base))); + base += PMD_SIZE; + pmd++; + } + } + else { + while (count--) { + pte_base = pte = alloc_bootmem_low_pages(PAGE_SIZE); + for (i=0; i < PTRS_PER_PTE; i++, pte++) { + set_pte(pte, pfn_pte((__pa(base) >> PAGE_SHIFT), + PAGE_KERNEL)); + base += PAGE_SIZE; + } + set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); + pmd++; + } + } +} + +static void initrd_unmap_high(unsigned long start, unsigned long end) +{ + unsigned long align_start = start & PMD_MASK; + unsigned long align_end = (end + PMD_SIZE - 1) & PMD_MASK; + unsigned long base; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int count = (align_end - align_start) / PMD_SIZE; + + pgd = pgd_offset_k(align_start); + pmd = pmd_offset(pgd, align_start); + base = align_start; + + while (count--) { + if (!pmd_has_pse(*pmd)) { + pte = (pmd_val(*pmd) & PAGE_MASK); + free_bootmem((unsigned long)pte, PAGE_SIZE); + } + pmd_clear(pmd); + base += PMD_SIZE; + pmd++; + } +} +#endif /* CONFIG_X86_MEM_HOTADD */ + void free_initrd_mem(unsigned long start, unsigned long end) { + unsigned long high_start = __va(max_low_pfn << PAGE_SHIFT); + if (start < end) printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); + +#ifndef CONFIG_X86_MEM_HOTADD for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); set_page_count(virt_to_page(start), 1); free_page(start); totalram_pages++; } +#else + for (; (start < end) && (start < high_start); start += PAGE_SIZE) { + ClearPageReserved(virt_to_page(start)); + set_page_count(virt_to_page(start), 1); + free_page(start); + totalram_pages++; + } + + if (start < end) + initrd_unmap_high(start, end); +#endif /* !CONFIG_X86_MEM_HOTADD */ + } -#endif +#endif /* CONFIG_BLK_DEV_INITRD */ diff 
-Naur -X dontdiff linux-2.5.69-ref/arch/i386/mm/mem_hotadd.c linux-2.5.69/arch/i386/mm/mem_hotadd.c --- linux-2.5.69-ref/arch/i386/mm/mem_hotadd.c Wed Dec 31 18:00:00 1969 +++ linux-2.5.69/arch/i386/mm/mem_hotadd.c Mon Jun 23 10:26:00 2003 @@ -0,0 +1,446 @@ +/* + * arch/i386/mm/mem_hotadd.c + * (c) 2002 Hewlett-Packard Development Company, L.P. + */ + +#include +#include +#include + +#ifdef CONFIG_MODULES +#include +#include +#endif + +/* + * The functions in this file integrate added memory into the system. + * Flow: + * 1. Calculate size of data structures needed + * 2. Make part of the new memory accessible by setting PGDs. Allocate + * data structures in this area + * 3. Initialize data structures - pg_data_t and mem map + * 4. Reserve memory used up by data structures + * 5. Add rest of the memory to free list + * 6. Update global variables + */ + +int mem_hotadd_count = 0; +static DECLARE_MUTEX(mem_hotadd_sem); + +extern unsigned long hotadd_reserve_size; +extern unsigned long hotadd_reserve_start; +extern unsigned long totalram_pages; +extern unsigned long totalhigh_pages; +extern unsigned long blk_max_pfn; +extern int numnodes; + +extern int kswapd(void *); + +static unsigned long hotadd_vaddr_start = -1; +static unsigned long hotadd_vaddr_left = -1; + +static void hotadd_mem_cleanup(unsigned long, unsigned long); + +#ifdef CONFIG_X86_PAE +#define kaddr_t unsigned long long +#else +#define kaddr_t unsigned long +#endif + +/* + * hotadd_mem_bootstrap(): Map part of the newly added memory to the + * kernel address space, starting at vaddr_start. + * Return size of the mapped memory (in bytes). 
+ */ + +static int +hotadd_mem_bootstrap (kaddr_t paddr_start, unsigned long totalpages) +{ + int required; + kaddr_t paddr; + unsigned long vaddr, vaddr_start, vaddr_end, pfn; + int i,j,k; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte, *pte_base; + + /* + * Calculate data structure sizes: pg_data_t and memory map + * Also get the number of PGDs required to access that amount + * of memory + */ +#if 0 + int map_size; + map_size = (sizeof(struct page) * (totalpages+1)); + required = map_size + sizeof(pg_data_t); +#endif + + vaddr_start = hotadd_vaddr_start; + required = (sizeof(struct page) * totalpages); + + if (hotadd_vaddr_left < required) + return -ENOSPC; + + + printk ("hotadd_mem_bootstrap: Initializing %lu memory pages for data structures\n", (required / PAGE_SIZE)); + /* + * Create PGD and PMD entries and set them in the global page directory + */ + vaddr_end = vaddr_start + required; + vaddr = vaddr_start; + paddr = paddr_start; + + while (vaddr < vaddr_end) { + + i = pgd_index(vaddr); + j = pmd_index(vaddr); + k = pte_index(vaddr); + pgd = (pgd_t *)(swapper_pg_dir + i); + pmd = pmd_offset(pgd, vaddr); + + if ( ((vaddr % PMD_SIZE) == 0) && + ((paddr % PMD_SIZE) == 0) && + (required >= PMD_SIZE) && + (cpu_has_pse)) { + unsigned long prot; + + prot = _KERNPG_TABLE + _PAGE_PSE; + pfn = (paddr >> PAGE_SHIFT); + + /* Make it "global" too if supported */ + if (cpu_has_pge) { + prot += _PAGE_GLOBAL; + } + set_pmd(pmd, pfn_pmd(pfn, __pgprot(prot))); + paddr += PMD_SIZE; + vaddr += PMD_SIZE; + required -= PMD_SIZE; + continue; + } + else { + int k_max = (required + PAGE_SIZE - 1) / PAGE_SIZE; + if ((k + k_max) > PTRS_PER_PTE) + k_max = PTRS_PER_PTE; + else + k_max += k; + + if (pmd_val(*pmd)) + pte_base = (pte_t *) __va(pmd_val(*pmd) & PAGE_MASK); + else { + pte_base = (pte_t *) kmalloc(PAGE_SIZE,GFP_KERNEL); + if (pte_base == NULL) { + printk("hotadd_mem_bootstrap: " + "kmalloc failed. 
" + "Cleaning up..\n"); + hotadd_mem_cleanup(vaddr_start, vaddr); + return -ENOMEM; + } + memset(pte_base,0,PAGE_SIZE); + } + pte = pte_base + k; + for (; k < k_max; pte++, k++) { + pfn = (paddr >> PAGE_SHIFT); + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + paddr += PAGE_SIZE; + vaddr += PAGE_SIZE; + required -= PAGE_SIZE; + } + if (! pmd_val(*pmd)) { + pfn = ((__pa(pte_base)) >> PAGE_SHIFT); + set_pmd(pmd, pfn_pmd(pfn, + __pgprot(_KERNPG_TABLE))); + } + } + } + + return (vaddr_end - vaddr_start); +} + +/* + * hotadd_init_pgdat(): Initialize the pg_data_t and page structures + * needed for newly added memory. Reserve used pages and add rest of + * the pages to free list. + */ +static int +hotadd_init_pgdat(unsigned long totalpages, kaddr_t paddr_start, int used) +{ + struct page *map; + pg_data_t *pgdat, *temp_pgdat; + unsigned long zone_sizes[MAX_NR_ZONES]; + unsigned long zhole_sizes[MAX_NR_ZONES]; + unsigned long vaddr_start, new_max_pfn; + int map_size; + int i, ret = 0; + + vaddr_start = hotadd_vaddr_start; + + for (i=0; i> PAGE_SHIFT), + zhole_sizes, 0); + + if (ret < 0) { + kfree (pgdat); + return ret; + } + + /* Mark as reserved all pages corresponding to memory used earlier */ + for (i=0; i <= ((used + PAGE_SIZE - 1) / PAGE_SIZE); i++) + SetPageReserved(map+i); + + printk ("hotadd_init_pgdat: Reserved %d pages\n", i); + + /* Add the new pg_data_t to the linked list */ + temp_pgdat = pgdat_list; + while (temp_pgdat->pgdat_next) + temp_pgdat = temp_pgdat->pgdat_next; + + temp_pgdat->pgdat_next = pgdat; + mb(); + + mem_hotadd_count++; + mb(); + + kernel_thread (kswapd, pgdat, CLONE_KERNEL); + + /* This should be done before adding new pages to free list. */ + new_max_pfn = (pgdat->node_start_pfn + pgdat->node_size); + if (blk_max_pfn < new_max_pfn) { + blk_max_pfn = new_max_pfn; + mb(); + } + + /* Add all (un-reserved) pages to free list */ + for (i=0; iflags)); + if (! 
PageReserved(map+i)) { + set_page_count(map+i, 1); + __free_page(map+i); + } + } + return 0; +} + +static void +hotadd_init_done(unsigned long totalpages, int used, unsigned long end_pfn) +{ + unsigned long next_start; + + printk("hotadd_init_done: Updating global variables\n"); + + next_start = (hotadd_vaddr_start + used + PAGE_SIZE - 1) & PAGE_MASK; + hotadd_vaddr_left -= (next_start - hotadd_vaddr_start); + hotadd_vaddr_start = next_start; + + /* + * Update globals. + */ + numnodes++; + num_physpages += totalpages; + totalram_pages += totalpages; + totalhigh_pages += totalpages; + if (max_mapnr < end_pfn) { + max_mapnr = end_pfn; + highend_pfn = end_pfn; + } +} + +static void +hotadd_mem_cleanup(unsigned long vaddr_start, unsigned long vaddr_end) +{ + unsigned long vaddr = vaddr_start; + int i, j; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte_base, *pte; + + i = pgd_index(vaddr); + j = pte_index(vaddr); + pgd = (pgd_t *)(swapper_pg_dir + i); + pmd = pmd_offset(pgd, vaddr); + + if (! pmd_val(*pmd)) + BUG(); + + if (pmd_has_pse(*pmd)) { + pmd_clear(pmd); + vaddr += PMD_SIZE; + } + else { + pte_base = (pte_t *) __va(pmd_val(*pmd) & PAGE_MASK); + + pte = pte_base + j; + for (; j < PTRS_PER_PTE; pte++, j++) { + pte_clear(pte); + vaddr += PAGE_SIZE; + } + } + + while (vaddr < vaddr_end) { + + i = pgd_index(vaddr); + pgd = (pgd_t *)(swapper_pg_dir + i); + pmd = pmd_offset(pgd, vaddr); + + if (! pmd_val(*pmd)) + BUG(); + + if (! pmd_has_pse(*pmd)) { + pte_base = (pte_t *) __va(pmd_val(*pmd) & PAGE_MASK); + kfree(pte_base); + } + pmd_clear(pmd); + vaddr += PMD_SIZE; + } +} + + +/* + * hotadd_mem_init(): Integrate hot-added memory into the system. 
+ */ +int hotadd_mem_init(unsigned long long start, unsigned long long size, + int flags) +{ +#ifndef CONFIG_X86_PAE + unsigned long paddr_start = (unsigned long)start; +#else + unsigned long long paddr_start = start; +#endif + pg_data_t *pgdat; + unsigned long pages = size / PAGE_SIZE; + unsigned long totalpages; + unsigned long start_pfn = (paddr_start >> PAGE_SHIFT); + unsigned long end_pfn = start_pfn + pages - 1; + unsigned long old_flags; + int used; + int ret = 0; + + printk ("Attempting to hot-add memory. Start address = 0x%Lx, Size = 0x%Lx (%lu pages)\n", start, size, pages); + + printk ("Free pages before hot-add: %u\n\n", nr_free_pages()); + + if (paddr_start & ~PAGE_MASK) + return -EINVAL; + + /* + * Adding to DMA or Normal zones is not allowed. + */ + if (paddr_start < __pa(high_memory)) + return -EINVAL; + +#ifndef CONFIG_X86_PAE + if (start_pfn >= MAX_NONPAE_PFN) + return -E2BIG; +#else + if (start_pfn >= MAX_PAE_PFN) + return -E2BIG; +#endif + if (numnodes == MAX_NR_NODES) + return -E2BIG; + +#ifndef CONFIG_X86_PAE + totalpages = (end_pfn < MAX_NONPAE_PFN) ? pages : + (MAX_NONPAE_PFN - start_pfn); + +#else + totalpages = (end_pfn < MAX_PAE_PFN) ? pages : + (MAX_PAE_PFN - start_pfn); +#endif + end_pfn = start_pfn + totalpages - 1; + + if (totalpages < pages) { + printk ("hotadd_mem_init: Can only add %lu pages out of %lu\n", totalpages, pages); + } + + down(&mem_hotadd_sem); + + /* + * Check for overlapping address ranges (should never happen..) 
+ */ + pgdat = pgdat_list; + while (pgdat) { + kaddr_t pgdat_start = (pgdat->node_start_pfn); + kaddr_t pgdat_end = pgdat_start + pgdat->node_size - 1; + + if ((start_pfn >= pgdat_start) && (start_pfn <= pgdat_end)) { + printk ("hotadd_mem_init: Overlapping memory region\n"); + ret = -EEXIST; + goto err; + } + + if ((end_pfn >= pgdat_start) && (end_pfn <= pgdat_end)) { + printk ("hotadd_mem_init: Overlapping memory region\n"); + ret = -EEXIST; + goto err; + } + pgdat = pgdat->pgdat_next; + } + + if (hotadd_vaddr_start == -1) + hotadd_vaddr_start = hotadd_reserve_start; + if (hotadd_vaddr_left == -1) + hotadd_vaddr_left = hotadd_reserve_size; + + /* Return if the reserved virtual address range is used up */ + if (hotadd_vaddr_left == 0) { + ret = -ENOMEM; + goto err; + } + + old_flags = current->flags; + current->flags |= PF_MEMALLOC; + + used = hotadd_mem_bootstrap(paddr_start, totalpages); + + if (used < 0) { + printk("hotadd_mem_init: Failed to initialize memory for data structures\n"); + ret = used; + goto done; + } + + ret = hotadd_init_pgdat(totalpages, paddr_start, used); + if (ret < 0) { + printk ("hotadd_init_pgdat failed. 
Cleaning up...\n"); + hotadd_mem_cleanup(hotadd_vaddr_start, + (hotadd_vaddr_start + used)); + goto done; + } + + hotadd_init_done(totalpages, used, end_pfn); + + printk ("Memory hot-add operation completed successfully\n"); + printk ("Free pages after hot-add: %u\n\n", nr_free_pages()); +done: + current->flags = old_flags; +err: + up(&mem_hotadd_sem); + return ret; +} + +EXPORT_SYMBOL(hotadd_mem_init); +EXPORT_SYMBOL(mem_hotadd_count); diff -Naur -X dontdiff linux-2.5.69-ref/include/asm-i386/mem_hotadd.h linux-2.5.69/include/asm-i386/mem_hotadd.h --- linux-2.5.69-ref/include/asm-i386/mem_hotadd.h Wed Dec 31 18:00:00 1969 +++ linux-2.5.69/include/asm-i386/mem_hotadd.h Mon Jun 23 10:26:00 2003 @@ -0,0 +1,60 @@ +/* + * linux/include/asm-i386/mem_hotadd.h + */ + +#ifndef __ASM_MEM_HOTADD_H +#define __ASM_MEM_HOTADD_H + +/* + * Re-defines macros defined elsewhere to handle the case of multiple + * pg_data_t structures and memory maps + */ + +extern int mem_hotadd_count; + +extern unsigned long num_physpages; + +#define MAX_PAE_PFN (1 << 23) + +/* + * Return a pointer to the pg_data_t structure corresponding to + * a given page frame number. 
+ */ +static inline struct pglist_data * __pfn_to_pgdat(unsigned long pfn) +{ + struct pglist_data *temp; + + if ((mem_hotadd_count == 0) && (pfn < num_physpages)) + return (pgdat_list); + + temp = pgdat_list; + while (temp) { + if ((pfn >= temp->node_start_pfn) && + (pfn < (temp->node_start_pfn + temp->node_size))) + return temp; + temp = temp->pgdat_next; + } + + return 0; +} + +#define pfn_to_page(pfn) \ +({ \ + struct pglist_data *__temp = __pfn_to_pgdat((pfn)); \ + unsigned long __index = (pfn) - __temp->node_start_pfn; \ + (struct page *) (__temp->node_mem_map + __index); \ +}) + +#define page_to_pfn(pg) \ +({ \ + struct page *__page = pg; \ + struct zone *__zone = page_zone(__page); \ + unsigned long __index = (__page - __zone->zone_mem_map); \ + (unsigned long)(__zone->zone_start_pfn + __index); \ +}) + +#define pfn_valid(pfn) ((__pfn_to_pgdat(pfn) != 0)) + +extern int hotadd_mem_init(unsigned long long, unsigned long long, int); + +#endif /* ASM_MEM_HOTADD_H */ diff -Naur -X dontdiff linux-2.5.69-ref/include/asm-i386/page.h linux-2.5.69/include/asm-i386/page.h --- linux-2.5.69-ref/include/asm-i386/page.h Sun May 4 18:53:02 2003 +++ linux-2.5.69/include/asm-i386/page.h Mon Jun 23 10:26:00 2003 @@ -123,15 +123,28 @@ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) + +#ifdef CONFIG_X86_MEM_HOTADD + +#ifndef __ASSEMBLY__ +extern unsigned long hotadd_reserve_size; +#endif /* __ASSEMBLY__ */ +#define __MAX_RDMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define MAXMEM ((-PAGE_OFFSET-VMALLOC_RESERVE-hotadd_reserve_size)) + +#else #define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#endif /* CONFIG_X86_MEM_HOTADD */ + #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifndef CONFIG_DISCONTIGMEM +#if (! defined CONFIG_DISCONTIGMEM) && (! 
defined CONFIG_X86_MEM_HOTADD) #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* !CONFIG_DISCONTIGMEM && !CONFIG_X86_MEM_HOTADD */ + #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) diff -Naur -X dontdiff linux-2.5.69-ref/include/asm-i386/pgtable.h linux-2.5.69/include/asm-i386/pgtable.h --- linux-2.5.69-ref/include/asm-i386/pgtable.h Sun May 4 18:53:36 2003 +++ linux-2.5.69/include/asm-i386/pgtable.h Mon Jun 23 10:26:00 2003 @@ -83,8 +83,16 @@ * area for the same reason. ;) */ #define VMALLOC_OFFSET (8*1024*1024) + +#ifdef CONFIG_X86_MEM_HOTADD +extern unsigned long hotadd_reserve_size; +#define VMALLOC_START (((unsigned long) high_memory + hotadd_reserve_size + \ + 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) +#else #define VMALLOC_START (((unsigned long) high_memory + 2*VMALLOC_OFFSET-1) & \ ~(VMALLOC_OFFSET-1)) +#endif + #define VMALLOC_VMADDR(x) ((unsigned long)(x)) #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) @@ -184,7 +192,7 @@ #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) - +#define pmd_has_pse(x) (pmd_val(x) & _PAGE_PSE) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) diff -Naur -X dontdiff linux-2.5.69-ref/include/linux/mm.h linux-2.5.69/include/linux/mm.h --- linux-2.5.69-ref/include/linux/mm.h Sun May 4 18:53:00 2003 +++ linux-2.5.69/include/linux/mm.h Mon Jun 23 10:26:00 2003 @@ -189,6 +189,10 @@ #endif /* CONFIG_HIGMEM || WANT_PAGE_VIRTUAL */ }; +#ifdef CONFIG_X86_MEM_HOTADD +#include +#endif + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) @@ -483,11 +487,11 @@ } extern void free_area_init(unsigned long * 
zones_size); -extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, +extern int free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, unsigned long * zones_size, unsigned long zone_start_pfn, - unsigned long *zholes_size); + unsigned long *zholes_size, int boot_flag); extern void memmap_init_zone(struct page *, unsigned long, int, - unsigned long, unsigned long); + unsigned long, unsigned long, int); extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); diff -Naur -X dontdiff linux-2.5.69-ref/include/linux/mmzone.h linux-2.5.69/include/linux/mmzone.h --- linux-2.5.69-ref/include/linux/mmzone.h Sun May 4 18:53:31 2003 +++ linux-2.5.69/include/linux/mmzone.h Mon Jun 23 10:26:00 2003 @@ -249,6 +249,30 @@ #define for_each_zone(zone) \ for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) +/* + * next_node_zone - helper for for_each_node_zone() + */ +static inline struct zone * next_node_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + int index = zone - (pgdat->node_zones); + + if (pgdat->pgdat_next) { + pgdat = pgdat->pgdat_next; + zone = pgdat->node_zones + index; + } + else + zone = NULL; + return zone; +} + +/* + * for_each_node_zone - macro to iterate over a given type + * of zone (say HIGHMEM) in all nodes. 
+ */ +#define for_each_node_zone(zone) \ + for (; zone; zone = next_node_zone(zone)) + #ifdef CONFIG_NUMA #define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */ #else /* !CONFIG_NUMA */ @@ -263,7 +287,17 @@ extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map + +#ifndef CONFIG_X86_MEM_HOTADD #define MAX_NR_NODES 1 +#else +#define MAX_NR_NODES (255 / MAX_NR_ZONES) +#endif + +#ifdef CONFIG_X86_MEM_HOTADD +#include +#endif + #else /* CONFIG_DISCONTIGMEM */ #include diff -Naur -X dontdiff linux-2.5.69-ref/mm/page_alloc.c linux-2.5.69/mm/page_alloc.c --- linux-2.5.69-ref/mm/page_alloc.c Sun May 4 18:53:01 2003 +++ linux-2.5.69/mm/page_alloc.c Mon Jun 23 10:26:00 2003 @@ -46,11 +46,12 @@ */ struct zone *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; EXPORT_SYMBOL(zone_table); +EXPORT_SYMBOL(pgdat_list); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; -static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; -static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 128, }; +static int zone_balance_min[MAX_NR_ZONES] = { 20 , 20, 20, }; +static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, }; /* * Temporary debugging check for pages not lying within a given zone. @@ -554,36 +555,60 @@ min = 1UL << order; for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; + int min_incr = 1; + unsigned long local_min = 0; - min += z->pages_low; - if (z->free_pages >= min || + for_each_node_zone (z) { + if (! 
z->present_pages) + continue; + if (min_incr) { + min += z->pages_low; + local_min = z->pages_low; + min_incr = 0; + } + if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { - page = buffered_rmqueue(z, order, cold); - if (page) - return page; + page = buffered_rmqueue(z, order, cold); + if (page) + return page; + } } - min += z->pages_low * sysctl_lower_zone_protection; + min += local_min * sysctl_lower_zone_protection; } /* we're somewhat low on memory, failed to find what we needed */ - for (i = 0; zones[i] != NULL; i++) - wakeup_kswapd(zones[i]); + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + for_each_node_zone(z) { + if (! z->present_pages) + continue; + wakeup_kswapd(z); + } + } /* Go through the zonelist again, taking __GFP_HIGH into account */ min = 1UL << order; for (i = 0; zones[i] != NULL; i++) { - unsigned long local_min; + unsigned long local_min = 0; struct zone *z = zones[i]; + int min_incr = 1; - local_min = z->pages_min; - if (gfp_mask & __GFP_HIGH) - local_min >>= 2; - min += local_min; - if (z->free_pages >= min || + for_each_node_zone (z) { + if (! z->present_pages) + continue; + if (min_incr) { + local_min = z->pages_min; + if (gfp_mask & __GFP_HIGH) + local_min >>= 2; + min += local_min; + min_incr = 0; + } + if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { - page = buffered_rmqueue(z, order, cold); - if (page) - return page; + page = buffered_rmqueue(z, order, cold); + if (page) + return page; + } } min += local_min * sysctl_lower_zone_protection; } @@ -596,9 +621,13 @@ for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; - page = buffered_rmqueue(z, order, cold); - if (page) - return page; + for_each_node_zone (z) { + if (! 
z->present_pages) + continue; + page = buffered_rmqueue(z, order, cold); + if (page) + return page; + } } goto nopage; } @@ -608,22 +637,39 @@ goto nopage; current->flags |= PF_MEMALLOC; - try_to_free_pages(classzone, gfp_mask, order); + { + struct zone *z = classzone; + for_each_node_zone(z) { + if (! z->present_pages) + continue; + try_to_free_pages(z, gfp_mask, order); + } + } current->flags &= ~PF_MEMALLOC; /* go through the zonelist yet one more time */ min = 1UL << order; for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; + int min_incr = 1; + unsigned long local_min = 0; - min += z->pages_min; - if (z->free_pages >= min || + for_each_node_zone (z) { + if (! z->present_pages) + continue; + if (min_incr) { + min += z->pages_min; + local_min = z->pages_low; + min_incr = 0; + } + if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { - page = buffered_rmqueue(z, order, cold); - if (page) - return page; + page = buffered_rmqueue(z, order, cold); + if (page) + return page; + } } - min += z->pages_low * sysctl_lower_zone_protection; + min += local_min * sysctl_lower_zone_protection; } /* @@ -1006,7 +1052,7 @@ /* * Builds allocation fallback zone lists. 
*/ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) { switch (k) { struct zone *zone; @@ -1014,12 +1060,12 @@ BUG(); case ZONE_HIGHMEM: zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { #ifndef CONFIG_HIGHMEM + if (zone->present_pages) { BUG(); -#endif - zonelist->zones[j++] = zone; } +#endif + zonelist->zones[j++] = zone; case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; if (zone->present_pages) @@ -1033,7 +1079,7 @@ return j; } -static void __init build_zonelists(pg_data_t *pgdat) +static void build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; @@ -1053,6 +1099,8 @@ k = ZONE_DMA; j = build_zonelists_node(pgdat, zonelist, j, k); + +#ifdef CONFIG_DISCONTIGMEM /* * Now we build the zonelist so that it contains the zones * of all the other nodes. @@ -1065,7 +1113,7 @@ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); for (node = 0; node < local_node; node++) j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - +#endif zonelist->zones[j++] = NULL; } } @@ -1122,7 +1170,7 @@ #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) -static void __init calculate_zone_totalpages(struct pglist_data *pgdat, +static void calculate_zone_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; @@ -1142,8 +1190,8 @@ /* * Get space for the valid bitmap. 
*/ -static void __init calculate_zone_bitmap(struct pglist_data *pgdat, - unsigned long *zones_size) +static int calculate_zone_bitmap(struct pglist_data *pgdat, + unsigned long *zones_size, int boot_flag) { unsigned long size = 0; int i; @@ -1151,8 +1199,16 @@ for (i = 0; i < MAX_NR_ZONES; i++) size += zones_size[i]; size = LONG_ALIGN((size + 7) >> 3); - pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(pgdat, size); + if (boot_flag) { + pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(pgdat, size); + } + else { + if ((pgdat->valid_addr_bitmap = (unsigned long *)kmalloc(size, GFP_KERNEL)) == NULL) + return -ENOMEM; + } memset(pgdat->valid_addr_bitmap, 0, size); + + return 0; } /* @@ -1160,15 +1216,17 @@ * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __init memmap_init_zone(struct page *start, unsigned long size, int nid, - unsigned long zone, unsigned long start_pfn) +void memmap_init_zone(struct page *start, unsigned long size, int nid, + unsigned long zone, unsigned long start_pfn, int boot_flag) { struct page *page; for (page = start; page < (start + size); page++) { set_page_zone(page, nid * MAX_NR_ZONES + zone); - set_page_count(page, 0); - SetPageReserved(page); + if (boot_flag) { + set_page_count(page, 0); + SetPageReserved(page); + } INIT_LIST_HEAD(&page->list); #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. 
*/ @@ -1180,8 +1238,8 @@ } #ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(start, size, nid, zone, start_pfn) \ - memmap_init_zone((start), (size), (nid), (zone), (start_pfn)) +#define memmap_init(start, size, nid, zone, start_pfn, boot_flag) \ + memmap_init_zone((start), (size), (nid), (zone), (start_pfn), (boot_flag)) #endif /* @@ -1190,8 +1248,9 @@ * - mark all memory queues empty * - clear the memory bitmaps */ -static void __init free_area_init_core(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) +static int free_area_init_core(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size, + int boot_flag) { unsigned long i, j; const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); @@ -1269,9 +1328,19 @@ zone->wait_table_size = wait_table_size(size); zone->wait_table_bits = wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size + if (boot_flag) { + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size * sizeof(wait_queue_head_t)); + } + else { + zone->wait_table = (wait_queue_head_t *) + kmalloc(zone->wait_table_size * sizeof(wait_queue_head_t),GFP_KERNEL); + if (zone->wait_table == NULL) + return -ENOMEM; + memset(zone->wait_table, 0, zone->wait_table_size + * sizeof(wait_queue_head_t)); + } for(i = 0; i < zone->wait_table_size; ++i) init_waitqueue_head(zone->wait_table + i); @@ -1293,7 +1362,7 @@ if ((zone_start_pfn) & (zone_required_alignment-1)) printk("BUG: wrong zone alignment, it will crash\n"); - memmap_init(lmem_map, size, nid, j, zone_start_pfn); + memmap_init(lmem_map, size, nid, j, zone_start_pfn, boot_flag); zone_start_pfn += size; lmem_map += size; @@ -1332,31 +1401,71 @@ */ bitmap_size = (size-1) >> (i+4); bitmap_size = LONG_ALIGN(bitmap_size+1); - zone->free_area[i].map = - (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); + if (boot_flag) { + 
zone->free_area[i].map = (unsigned long *) + alloc_bootmem_node(pgdat, bitmap_size); + } + else { + zone->free_area[i].map = (unsigned long *) + kmalloc(bitmap_size, GFP_KERNEL); + if (zone->free_area[i].map == NULL) + return -ENOMEM; + memset(zone->free_area[i].map,0,bitmap_size); + } } } + + return 0; } -void __init free_area_init_node(int nid, struct pglist_data *pgdat, +int free_area_init_node(int nid, struct pglist_data *pgdat, struct page *node_mem_map, unsigned long *zones_size, - unsigned long node_start_pfn, unsigned long *zholes_size) + unsigned long node_start_pfn, unsigned long *zholes_size, + int boot_flag) { unsigned long size; + int i, j; pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; calculate_zone_totalpages(pgdat, zones_size, zholes_size); if (!node_mem_map) { + BUG_ON (boot_flag == 0); size = (pgdat->node_size + 1) * sizeof(struct page); node_mem_map = alloc_bootmem_node(pgdat, size); } pgdat->node_mem_map = node_mem_map; - free_area_init_core(pgdat, zones_size, zholes_size); - memblk_set_online(node_to_memblk(nid)); + if (free_area_init_core(pgdat, zones_size, zholes_size, boot_flag) < 0) + goto err_cleanup; + + if (boot_flag) + memblk_set_online(node_to_memblk(nid)); + + if (! boot_flag) + build_zonelists(pgdat); + + if (calculate_zone_bitmap(pgdat, zones_size, boot_flag) < 0) + goto err_cleanup; + + return 0; + +err_cleanup: + BUG_ON (boot_flag == 1); + printk ("free_area_init_node: kmalloc failed. Cleaning up...\n"); + for (i=0; i<MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + if (! 
zone->present_pages) + continue; + if (zone->wait_table) + kfree(zone->wait_table); + for (j=0; j < MAX_ORDER; j++) { /* index by j: one map per free_area slot */ + if (zone->free_area[j].map) + kfree(zone->free_area[j].map); + } + } - calculate_zone_bitmap(pgdat, zones_size); + return -ENOMEM; } #ifndef CONFIG_DISCONTIGMEM @@ -1366,7 +1475,7 @@ void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, &contig_page_data, NULL, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL, 1); mem_map = contig_page_data.node_mem_map; } #endif