summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c327
1 files changed, 180 insertions, 147 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 402a504f122..bdd5c432c42 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -45,6 +45,7 @@
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/memcontrol.h>
+#include <linux/debugobjects.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -532,8 +533,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
if (reserved)
return;
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
+ debug_check_no_obj_freed(page_address(page),
+ PAGE_SIZE << order);
+ }
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
@@ -546,7 +550,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
/*
* permit the bootmem allocator to evade page validation on high-order frees
*/
-void __init __free_pages_bootmem(struct page *page, unsigned int order)
+void __free_pages_bootmem(struct page *page, unsigned int order)
{
if (order == 0) {
__ClearPageReserved(page);
@@ -632,7 +636,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
if (PageReserved(page))
return 1;
- page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
+ page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
set_page_private(page, 0);
@@ -995,8 +999,10 @@ static void free_hot_cold_page(struct page *page, int cold)
if (free_pages_check(page))
return;
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+ debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
+ }
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
@@ -1050,7 +1056,7 @@ void split_page(struct page *page, unsigned int order)
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-static struct page *buffered_rmqueue(struct zonelist *zonelist,
+static struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags)
{
unsigned long flags;
@@ -1102,7 +1108,7 @@ again:
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(zonelist, zone);
+ zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
put_cpu();
@@ -1284,7 +1290,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
if (!zlc)
return NULL;
- if (time_after(jiffies, zlc->last_full_zap + HZ)) {
+ if (time_after(jiffies, zlc->last_full_zap + HZ)) {
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
zlc->last_full_zap = jiffies;
}
@@ -1317,7 +1323,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
* We are low on memory in the second scan, and should leave no stone
* unturned looking for a free page.
*/
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
@@ -1328,7 +1334,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
if (!zlc)
return 1;
- i = z - zonelist->zones;
+ i = z - zonelist->_zonerefs;
n = zlc->z_to_n[i];
/* This zone is worth trying if it is allowed but not full */
@@ -1340,7 +1346,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
* zlc->fullzones, so that subsequent attempts to allocate a page
* from that zone don't waste time re-examining it.
*/
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
@@ -1349,7 +1355,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
if (!zlc)
return;
- i = z - zonelist->zones;
+ i = z - zonelist->_zonerefs;
set_bit(i, zlc->fullzones);
}
@@ -1361,13 +1367,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
return NULL;
}
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
return 1;
}
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
}
#endif /* CONFIG_NUMA */
@@ -1377,42 +1383,31 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
* a page.
*/
static struct page *
-get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, int alloc_flags)
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
+ struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
{
- struct zone **z;
+ struct zoneref *z;
struct page *page = NULL;
- int classzone_idx = zone_idx(zonelist->zones[0]);
- struct zone *zone;
+ int classzone_idx;
+ struct zone *zone, *preferred_zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
- enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
+
+ (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
+ &preferred_zone);
+ classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- z = zonelist->zones;
-
- do {
- /*
- * In NUMA, this could be a policy zonelist which contains
- * zones that may not be allowed by the current gfp_mask.
- * Check the zone is allowed by the current flags
- */
- if (unlikely(alloc_should_filter_zonelist(zonelist))) {
- if (highest_zoneidx == -1)
- highest_zoneidx = gfp_zone(gfp_mask);
- if (zone_idx(*z) > highest_zoneidx)
- continue;
- }
-
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ high_zoneidx, nodemask) {
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
- zone = *z;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
goto try_next_zone;
@@ -1433,7 +1428,7 @@ zonelist_scan:
}
}
- page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
+ page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
if (page)
break;
this_zone_full:
@@ -1446,7 +1441,7 @@ try_next_zone:
zlc_active = 1;
did_zlc_setup = 1;
}
- } while (*(++z) != NULL);
+ }
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
@@ -1459,18 +1454,21 @@ try_next_zone:
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist)
+static struct page *
+__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
- struct zone **z;
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ struct zoneref *z;
+ struct zone *zone;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
int do_retry;
int alloc_flags;
- int did_some_progress;
+ unsigned long did_some_progress;
+ unsigned long pages_reclaimed = 0;
might_sleep_if(wait);
@@ -1478,9 +1476,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
return NULL;
restart:
- z = zonelist->zones; /* the list of zones suitable for gfp_mask */
+ z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
- if (unlikely(*z == NULL)) {
+ if (unlikely(!z->zone)) {
/*
* Happens if we have an empty zonelist as a result of
* GFP_THISNODE being used on a memoryless node
@@ -1488,8 +1486,8 @@ restart:
return NULL;
}
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
if (page)
goto got_pg;
@@ -1504,8 +1502,8 @@ restart:
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
- for (z = zonelist->zones; *z; z++)
- wakeup_kswapd(*z, order);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -1533,7 +1531,8 @@ restart:
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
+ high_zoneidx, alloc_flags);
if (page)
goto got_pg;
@@ -1545,8 +1544,8 @@ rebalance:
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
/* go through the zonelist yet again, ignoring mins */
- page = get_page_from_freelist(gfp_mask, order,
- zonelist, ALLOC_NO_WATERMARKS);
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
if (page)
goto got_pg;
if (gfp_mask & __GFP_NOFAIL) {
@@ -1569,7 +1568,7 @@ nofail_alloc:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -1580,12 +1579,12 @@ nofail_alloc:
drain_all_pages();
if (likely(did_some_progress)) {
- page = get_page_from_freelist(gfp_mask, order,
- zonelist, alloc_flags);
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, alloc_flags);
if (page)
goto got_pg;
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- if (!try_set_zone_oom(zonelist)) {
+ if (!try_set_zone_oom(zonelist, gfp_mask)) {
schedule_timeout_uninterruptible(1);
goto restart;
}
@@ -1596,21 +1595,22 @@ nofail_alloc:
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+ order, zonelist, high_zoneidx,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET);
if (page) {
- clear_zonelist_oom(zonelist);
+ clear_zonelist_oom(zonelist, gfp_mask);
goto got_pg;
}
/* The OOM killer will not help higher order allocs so fail */
if (order > PAGE_ALLOC_COSTLY_ORDER) {
- clear_zonelist_oom(zonelist);
+ clear_zonelist_oom(zonelist, gfp_mask);
goto nopage;
}
out_of_memory(zonelist, gfp_mask, order);
- clear_zonelist_oom(zonelist);
+ clear_zonelist_oom(zonelist, gfp_mask);
goto restart;
}
@@ -1618,14 +1618,26 @@ nofail_alloc:
* Don't let big-order allocations loop unless the caller explicitly
* requests that. Wait for some write requests to complete then retry.
*
- * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
- * <= 3, but that may not be true in other implementations.
+ * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+ * means __GFP_NOFAIL, but that may not be true in other
+ * implementations.
+ *
+ * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+ * specified, then we retry until we no longer reclaim any pages
+ * (above), or we've reclaimed an order of pages at least as
+ * large as the allocation's order. In both cases, if the
+ * allocation still fails, we stop retrying.
*/
+ pages_reclaimed += did_some_progress;
do_retry = 0;
if (!(gfp_mask & __GFP_NORETRY)) {
- if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
- (gfp_mask & __GFP_REPEAT))
+ if (order <= PAGE_ALLOC_COSTLY_ORDER) {
do_retry = 1;
+ } else {
+ if (gfp_mask & __GFP_REPEAT &&
+ pages_reclaimed < (1 << order))
+ do_retry = 1;
+ }
if (gfp_mask & __GFP_NOFAIL)
do_retry = 1;
}
@@ -1646,6 +1658,20 @@ got_pg:
return page;
}
+struct page *
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist)
+{
+ return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
+}
+
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask)
+{
+ return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
+}
+
EXPORT_SYMBOL(__alloc_pages);
/*
@@ -1712,15 +1738,15 @@ EXPORT_SYMBOL(free_pages);
static unsigned int nr_free_zone_pages(int offset)
{
+ struct zoneref *z;
+ struct zone *zone;
+
/* Just pick one node, since fallback list is circular */
- pg_data_t *pgdat = NODE_DATA(numa_node_id());
unsigned int sum = 0;
- struct zonelist *zonelist = pgdat->node_zonelists + offset;
- struct zone **zonep = zonelist->zones;
- struct zone *zone;
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
- for (zone = *zonep++; zone; zone = *zonep++) {
+ for_each_zone_zonelist(zone, z, zonelist, offset) {
unsigned long size = zone->present_pages;
unsigned long high = zone->pages_high;
if (size > high)
@@ -1889,6 +1915,12 @@ void show_free_areas(void)
show_swap_cache_info();
}
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+ zoneref->zone = zone;
+ zoneref->zone_idx = zone_idx(zone);
+}
+
/*
* Builds allocation fallback zone lists.
*
@@ -1906,7 +1938,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
zone_type--;
zone = pgdat->node_zones + zone_type;
if (populated_zone(zone)) {
- zonelist->zones[nr_zones++] = zone;
+ zoneref_set_zone(zone,
+ &zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
@@ -2029,6 +2062,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
int min_val = INT_MAX;
int best_node = -1;
+ node_to_cpumask_ptr(tmp, 0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
@@ -2037,7 +2071,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
}
for_each_node_state(n, N_HIGH_MEMORY) {
- cpumask_t tmp;
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
@@ -2050,8 +2083,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
val += (n < node);
/* Give preference to headless and unused nodes */
- tmp = node_to_cpumask(n);
- if (!cpus_empty(tmp))
+ node_to_cpumask_ptr_next(tmp, n);
+ if (!cpus_empty(*tmp))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
@@ -2078,17 +2111,16 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
- enum zone_type i;
int j;
struct zonelist *zonelist;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zonelist = pgdat->node_zonelists + i;
- for (j = 0; zonelist->zones[j] != NULL; j++)
- ;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- zonelist->zones[j] = NULL;
- }
+ zonelist = &pgdat->node_zonelists[0];
+ for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
+ ;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
}
/*
@@ -2096,15 +2128,13 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
- enum zone_type i;
int j;
struct zonelist *zonelist;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
- j = build_zonelists_node(pgdat, zonelist, 0, i);
- zonelist->zones[j] = NULL;
- }
+ zonelist = &pgdat->node_zonelists[1];
+ j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
}
/*
@@ -2117,27 +2147,26 @@ static int node_order[MAX_NUMNODES];
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
- enum zone_type i;
int pos, j, node;
int zone_type; /* needs to be signed */
struct zone *z;
struct zonelist *zonelist;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zonelist = pgdat->node_zonelists + i;
- pos = 0;
- for (zone_type = i; zone_type >= 0; zone_type--) {
- for (j = 0; j < nr_nodes; j++) {
- node = node_order[j];
- z = &NODE_DATA(node)->node_zones[zone_type];
- if (populated_zone(z)) {
- zonelist->zones[pos++] = z;
- check_highest_zone(zone_type);
- }
+ zonelist = &pgdat->node_zonelists[0];
+ pos = 0;
+ for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
+ for (j = 0; j < nr_nodes; j++) {
+ node = node_order[j];
+ z = &NODE_DATA(node)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ zoneref_set_zone(z,
+ &zonelist->_zonerefs[pos++]);
+ check_highest_zone(zone_type);
}
}
- zonelist->zones[pos] = NULL;
}
+ zonelist->_zonerefs[pos].zone = NULL;
+ zonelist->_zonerefs[pos].zone_idx = 0;
}
static int default_zonelist_order(void)
@@ -2214,7 +2243,8 @@ static void build_zonelists(pg_data_t *pgdat)
/* initialize zonelists */
for (i = 0; i < MAX_ZONELISTS; i++) {
zonelist = pgdat->node_zonelists + i;
- zonelist->zones[0] = NULL;
+ zonelist->_zonerefs[0].zone = NULL;
+ zonelist->_zonerefs[0].zone_idx = 0;
}
/* NUMA-aware ordering of nodes */
@@ -2264,19 +2294,15 @@ static void build_zonelists(pg_data_t *pgdat)
/* Construct the zonelist performance cache - see further mmzone.h */
static void build_zonelist_cache(pg_data_t *pgdat)
{
- int i;
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zonelist *zonelist;
- struct zonelist_cache *zlc;
- struct zone **z;
+ struct zonelist *zonelist;
+ struct zonelist_cache *zlc;
+ struct zoneref *z;
- zonelist = pgdat->node_zonelists + i;
- zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
- bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
- for (z = zonelist->zones; *z; z++)
- zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
- }
+ zonelist = &pgdat->node_zonelists[0];
+ zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ for (z = zonelist->_zonerefs; z->zone; z++)
+ zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
}
@@ -2290,45 +2316,44 @@ static void set_zonelist_order(void)
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
- enum zone_type i,j;
+ enum zone_type j;
+ struct zonelist *zonelist;
local_node = pgdat->node_id;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zonelist *zonelist;
-
- zonelist = pgdat->node_zonelists + i;
- j = build_zonelists_node(pgdat, zonelist, 0, i);
- /*
- * Now we build the zonelist so that it contains the zones
- * of all the other nodes.
- * We don't want to pressure a particular node, so when
- * building the zones for node N, we make sure that the
- * zones coming right after the local ones are those from
- * node N+1 (modulo N)
- */
- for (node = local_node + 1; node < MAX_NUMNODES; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- }
- for (node = 0; node < local_node; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- }
+ zonelist = &pgdat->node_zonelists[0];
+ j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
- zonelist->zones[j] = NULL;
+ /*
+ * Now we build the zonelist so that it contains the zones
+ * of all the other nodes.
+ * We don't want to pressure a particular node, so when
+ * building the zones for node N, we make sure that the
+ * zones coming right after the local ones are those from
+ * node N+1 (modulo N)
+ */
+ for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
}
+ for (node = 0; node < local_node; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
+ }
+
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
}
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
static void build_zonelist_cache(pg_data_t *pgdat)
{
- int i;
-
- for (i = 0; i < MAX_NR_ZONES; i++)
- pgdat->node_zonelists[i].zlcache_ptr = NULL;
+ pgdat->node_zonelists[0].zlcache_ptr = NULL;
+ pgdat->node_zonelists[1].zlcache_ptr = NULL;
}
#endif /* CONFIG_NUMA */
@@ -2518,7 +2543,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
+ struct zone *z;
+ z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
@@ -2536,7 +2563,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
init_page_count(page);
reset_page_mapcount(page);
SetPageReserved(page);
-
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -2545,8 +2571,15 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* kernel allocations are made. Later some blocks near
* the start are marked MIGRATE_RESERVE by
* setup_zone_migrate_reserve()
+ *
+ * bitmap is created for zone's valid pfn range. but memmap
+ * can be created for invalid pages (for alignment)
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
*/
- if ((pfn & (pageblock_nr_pages-1)))
+ if ((z->zone_start_pfn <= pfn)
+ && (pfn < z->zone_start_pfn + z->spanned_pages)
+ && !(pfn & (pageblock_nr_pages - 1)))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
INIT_LIST_HEAD(&page->lru);
@@ -4339,9 +4372,7 @@ void *__init alloc_large_system_hash(const char *tablename,
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
- unsigned long order;
- for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
- ;
+ unsigned long order = get_order(size);
table = (void*) __get_free_pages(GFP_ATOMIC, order);
/*
* If bucketsize is not a power-of-two, we may free
@@ -4460,6 +4491,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ VM_BUG_ON(pfn < zone->zone_start_pfn);
+ VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
if (flags & value)