NUMA-aware page allocation with one kswapd thread per node.

  include/linux/mmzone.h  adds NUMA_RATIO and a per-node kswapd_wait_queue
                          pointer to pg_data_t.
  include/linux/swap.h    drops the extern for the global kswapd_wait.
  mm/page_alloc.c         __alloc_pages() walks the zonelist one node at a
                          time; each node that fails the pages_low pass is
                          counted in short_nodes, and once short_nodes reaches
                          NUMA_RATIO the per-node kswapd wait queues are woken.
  mm/vmscan.c             kswapd() becomes kswapd_node(), which balances only
                          the pg_data_t it is handed; kswapd_init() starts one
                          such thread per node.

Review changes folded into this revision (hunk line counts are unchanged):
  * kswapd_init(): the wait queue was allocated once, through NODE_DATA(i)
    with 'i' still uninitialised, yet every node's queue was then
    initialised and used.  The allocation now happens per node inside the
    start-up loop.
  * kswapd_node(): the thread name is formatted with snprintf() bounded by
    the buffer size instead of an unbounded sprintf().
  * __alloc_pages(): the debug printk printed pointers with %lx; it now
    uses %p.
  * Comment typos fixed ("into account", "This loop scans", "woken up").

NOTE(review): indentation in this file was reconstructed (tabs for leading
indent, single spaces elsewhere).  If context lines do not match the target
tree byte for byte, apply with "patch -p1 -l".

diff -urNp a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h	Thu Aug  1 00:59:07 2002
+++ b/include/linux/mmzone.h	Fri Aug  2 00:13:12 2002
@@ -18,6 +18,16 @@
 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
 #endif
 
+/*
+ * NUMA_RATIO is a constant that tries to take latency between
+ * nodes into account, when allocating memory.
+ * NUMA_RATIO has to be between 1 and numnodes. The lower
+ * NUMA_RATIO is, the more you care about allocating pages on
+ * your local node. The higher it is, the more you don't mind
+ * getting memory far away from the local node.
+ */
+#define NUMA_RATIO numnodes
+
 typedef struct free_area_struct {
 	struct list_head free_list;
 	unsigned long *map;
@@ -108,6 +118,7 @@ typedef struct pglist_data {
 	unsigned long node_start_mapnr;
 	unsigned long node_size;
 	int node_id;
+	wait_queue_head_t * kswapd_wait_queue;
 	struct pglist_data *node_next;
 } pg_data_t;
 
diff -urNp a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h	Thu Nov 22 20:46:19 2001
+++ b/include/linux/swap.h	Fri Aug  2 00:13:12 2002
@@ -110,7 +110,6 @@ extern void FASTCALL(activate_page(struc
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
-extern wait_queue_head_t kswapd_wait;
 extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int));
 
 /* linux/mm/page_io.c */
diff -urNp a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c	Mon Jul 29 15:23:28 2002
+++ b/mm/page_alloc.c	Fri Aug  2 00:13:12 2002
@@ -310,54 +310,115 @@ static struct page * balance_classzone(z
  */
 struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
 {
-	unsigned long min;
-	zone_t **zone, * classzone;
+	unsigned long min_low, min_min;
+	zone_t **zone, **current_zone, * classzone, *z;
 	struct page * page;
-	int freed;
+	int freed, short_nodes = 0;
+	struct pglist_data* current_node;
 
 	zone = zonelist->zones;
-	classzone = *zone;
-	min = 1UL << order;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-
-		min += z->pages_low;
-		if (z->free_pages > min) {
-			page = rmqueue(z, order);
-			if (page)
-				return page;
-		}
-	}
-
-	classzone->need_balance = 1;
-	mb();
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
+	min_low = 1UL << order;
+	min_min = 1UL << order;
 
 	zone = zonelist->zones;
-	min = 1UL << order;
-	for (;;) {
-		unsigned long local_min;
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
+	z = *zone;
+	for(;;){
+		/*
+		 * This loop scans all the zones
+		 */
+		min_low = 0;
+		min_min = 0;
+		current_node = z->zone_pgdat;
+		current_zone = zone;
+		classzone = z;
+
+		printk(KERN_DEBUG "node_id : %d\n", current_node->node_id);
+		printk(KERN_DEBUG "classzone : %p <-> node_zones : %p\n", classzone, current_node->node_zones);
+		do{
+			/*
+			 * This loop scans all the zones of
+			 * the current node.
+			 */
+			min_low += z->pages_low;
+			if (z->free_pages > min_low) {
+				page = rmqueue(z, order);
+				if (page)
+					return page;
+			}
+			z = *(++zone);
+		}while(z && (z->zone_pgdat == current_node));
+		/*
+		 * The node is low on memory.
+		 * We mark it as unbalanced, and
+		 * we wake its swap daemon up.
+		 */
+		short_nodes++;
+		classzone->need_balance = 1;
+		mb();
+		if(short_nodes >= NUMA_RATIO){
+			/*
+			 * If we're here, that means that at least NUMA_RATIO
+			 * nodes are getting short on memory.
+			 * Let's start the swap daemons on these nodes.
+			 */
+			zone = zonelist->zones;
+			z = *zone;
+
+			if (waitqueue_active(current_node->kswapd_wait_queue))
+				wake_up_interruptible(current_node->kswapd_wait_queue);
+			while(z->zone_pgdat != current_node){
+				if (waitqueue_active(z->zone_pgdat->kswapd_wait_queue))
+					wake_up_interruptible(z->zone_pgdat->kswapd_wait_queue);
+				z = *(++zone);
+			}
+		}
 
-		local_min = z->pages_min;
-		if (!(gfp_mask & __GFP_WAIT))
-			local_min >>= 2;
-		min += local_min;
-		if (z->free_pages > min) {
-			page = rmqueue(z, order);
-			if (page)
-				return page;
+		/*
+		 * We want to try again in the current node.
+		 */
+		zone = current_zone;
+		z = *zone;
+		do{
+			unsigned long local_min;
+			local_min = z->pages_min;
+			if (!(gfp_mask & __GFP_WAIT))
+				local_min >>= 2;
+			min_min += local_min;
+			if (z->free_pages > min_min) {
+				page = rmqueue(z, order);
+				if (page)
+					return page;
+			}
+			z = *(++zone);
+		}while(z && (z->zone_pgdat == current_node));
+
+		/*
+		 * We need to take a look at the NUMA_RATIO before trying to
+		 * allocate memory brutally.
+		 */
+		if (current->flags & (PF_MEMALLOC | PF_MEMDIE)){
+			short_nodes = 0;
+			zone = current_zone;
+			z = *zone;
+			do{
+				page = rmqueue(z, order);
+				if (page)
+					return page;
+				z = *(++zone);
+			}while(z && (z->zone_pgdat == current_node));
 		}
+		if(!z)
+			break;
 	}
 
-	/* here we're in the low on memory slow path */
-
 rebalance:
+	/*
+	 * We were not able to find enough memory.
+	 * Since many swap daemons have been woken up,
+	 * we might be able to find some pages.
+	 * If not, we need to balance the entire memory.
+	 */
+	classzone = *zonelist->zones;
 	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
 		zone = zonelist->zones;
 		for (;;) {
@@ -381,14 +442,14 @@ rebalance:
 		return page;
 
 	zone = zonelist->zones;
-	min = 1UL << order;
+	min_min = 1UL << order;
 	for (;;) {
 		zone_t *z = *(zone++);
 		if (!z)
 			break;
 
-		min += z->pages_min;
-		if (z->free_pages > min) {
+		min_min += z->pages_min;
+		if (z->free_pages > min_min) {
 			page = rmqueue(z, order);
 			if (page)
 				return page;
diff -urNp a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c	Mon Feb 25 20:38:14 2002
+++ b/mm/vmscan.c	Fri Aug  2 00:13:12 2002
@@ -605,7 +605,6 @@ int try_to_free_pages(zone_t *classzone,
 	return 0;
 }
 
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
 
 static int check_classzone_need_balance(zone_t * classzone)
 {
@@ -620,44 +619,32 @@ static int check_classzone_need_balance(
 	return 1;
 }
 
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
+static void kswapd_balance_pgdat(pg_data_t * pgdat)
 {
-	int need_more_balance = 0, i;
+	int need_more_balance, i;
 	zone_t * zone;
-
-	for (i = pgdat->nr_zones-1; i >= 0; i--) {
-		zone = pgdat->node_zones + i;
-		if (unlikely(current->need_resched))
-			schedule();
-		if (!zone->need_balance)
-			continue;
-		if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
-			zone->need_balance = 0;
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ);
-			continue;
+	do{
+		need_more_balance = 0;
+		for (i = pgdat->nr_zones-1; i >= 0; i--) {
+			zone = pgdat->node_zones + i;
+			if (unlikely(current->need_resched))
+				schedule();
+			if (!zone->need_balance)
+				continue;
+			if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
+				zone->need_balance = 0;
+				__set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ);
+				continue;
+			}
+			if (check_classzone_need_balance(zone))
+				need_more_balance = 1;
+			else
+				zone->need_balance = 0;
 		}
-		if (check_classzone_need_balance(zone))
-			need_more_balance = 1;
-		else
-			zone->need_balance = 0;
-	}
+	}while (need_more_balance);
 
-	return need_more_balance;
-}
-
-static void kswapd_balance(void)
-{
-	int need_more_balance;
-	pg_data_t * pgdat;
-
-	do {
-		need_more_balance = 0;
-		pgdat = pgdat_list;
-		do
-			need_more_balance |= kswapd_balance_pgdat(pgdat);
-		while ((pgdat = pgdat->node_next));
-	} while (need_more_balance);
+	//return need_more_balance;
 }
 
 static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
@@ -675,19 +662,6 @@ static int kswapd_can_sleep_pgdat(pg_dat
 	return 1;
 }
 
-static int kswapd_can_sleep(void)
-{
-	pg_data_t * pgdat;
-
-	pgdat = pgdat_list;
-	do {
-		if (kswapd_can_sleep_pgdat(pgdat))
-			continue;
-		return 0;
-	} while ((pgdat = pgdat->node_next));
-
-	return 1;
-}
 
 /*
  * The background pageout daemon, started as a kernel thread
@@ -702,13 +676,21 @@ static int kswapd_can_sleep(void)
  * If there are applications that are active memory-allocators
  * (most normal use), this basically shouldn't matter.
  */
-int kswapd(void *unused)
+
+
+int kswapd_node(void * unused)
 {
 	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
+	char kswapd_node_string[16];
+	int node_id;
+	pg_data_t * node;
+	DECLARE_WAITQUEUE(wait, tsk);
+	node = (pg_data_t *) unused;
+	node_id = node->node_id;
+	snprintf(kswapd_node_string, sizeof(kswapd_node_string), "kswapd_node_%d", node_id);
 
 	daemonize();
-	strcpy(tsk->comm, "kswapd");
+	strcpy(tsk->comm, kswapd_node_string);
 	sigfillset(&tsk->blocked);
 
 	/*
@@ -730,30 +712,46 @@ int kswapd(void *unused)
 	 */
 	for (;;) {
 		__set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&kswapd_wait, &wait);
+		add_wait_queue(node->kswapd_wait_queue, &wait);
 
 		mb();
-		if (kswapd_can_sleep())
+		if (kswapd_can_sleep_pgdat(node))
 			schedule();
 
 		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&kswapd_wait, &wait);
+		remove_wait_queue(node->kswapd_wait_queue, &wait);
 
 		/*
 		 * If we actually get into a low-memory situation,
 		 * the processes needing more memory will wake us
 		 * up on a more timely basis.
 		 */
-		kswapd_balance();
+		kswapd_balance_pgdat(node);
 		run_task_queue(&tq_disk);
 	}
 }
 
+static int kswapd_node_init(int node)
+{
+
+	printk("Starting kswapd on node %d\n", node);
+	kernel_thread(kswapd_node, (void *)NODE_DATA(node), CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	return 0;
+}
+
 static int __init kswapd_init(void)
 {
-	printk("Starting kswapd\n");
+	int i;
+
 	swap_setup();
-	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	for (i = 0; i < numnodes; i++) {
+		NODE_DATA(i)->kswapd_wait_queue = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL);
+		if (!NODE_DATA(i)->kswapd_wait_queue)
+			panic("Cannot allocate wait queue array for kswapd daemons");
+		init_waitqueue_head(NODE_DATA(i)->kswapd_wait_queue);
+		kswapd_node_init(i);
+	}
+
 	return 0;
 }
 