Subject: [RFC] spufs: fix for CONFIG_NUMA
From: Joel H Schopp

Based on an older patch from Mike Kravetz

We need to have a mem_map for high addresses in order to make the
fops->nopage handlers work on spufs mem and register files.  So far, we
have used the memory_present() function during early bootup, but that
did not work when CONFIG_NUMA was enabled.

We now use the __add_pages() function to add the mem_map when loading
the spufs module, which is a lot nicer.

Unfortunately, the memory hot-add code is currently a little broken (it
tries to do bootmem_alloc and kmalloc from the same function), so I had
to add an ugly hack to the common code for that, which I don't want to
submit for upstream inclusion in its current form.

Signed-off-by: Arnd Bergmann

Index: linus-2.6/arch/powerpc/kernel/setup_64.c
===================================================================
--- linus-2.6.orig/arch/powerpc/kernel/setup_64.c
+++ linus-2.6/arch/powerpc/kernel/setup_64.c
@@ -613,7 +613,6 @@ void __init setup_arch(char **cmdline_p)
 
 	/* set up the bootmem stuff with available memory */
 	do_init_bootmem();
-	cell_spumem_init(1);
 	sparse_init();
 
 #ifdef CONFIG_DUMMY_CONSOLE
Index: linus-2.6/arch/powerpc/platforms/cell/setup.c
===================================================================
--- linus-2.6.orig/arch/powerpc/platforms/cell/setup.c
+++ linus-2.6/arch/powerpc/platforms/cell/setup.c
@@ -29,6 +29,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #include
 #include
@@ -71,73 +73,6 @@ static void cell_show_cpuinfo(struct seq
 	of_node_put(root);
 }
 
-#ifdef CONFIG_SPARSEMEM
-static int __init find_spu_node_id(struct device_node *spe)
-{
-	unsigned int *id;
-#ifdef CONFIG_NUMA
-	struct device_node *cpu;
-	cpu = spe->parent->parent;
-	id = (unsigned int *)get_property(cpu, "node-id", NULL);
-#else
-	id = NULL;
-#endif
-	return id ? *id : 0;
-}
-
-static void __init cell_spuprop_present(struct device_node *spe,
-					const char *prop, int early)
-{
-	struct address_prop {
-		unsigned long address;
-		unsigned int len;
-	} __attribute__((packed)) *p;
-	int proplen;
-
-	unsigned long start_pfn, end_pfn, pfn;
-	int node_id;
-
-	p = (void*)get_property(spe, prop, &proplen);
-	WARN_ON(proplen != sizeof (*p));
-
-	node_id = find_spu_node_id(spe);
-
-	start_pfn = p->address >> PAGE_SHIFT;
-	end_pfn = (p->address + p->len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
-	/* We need to call memory_present *before* the call to sparse_init,
-	   but we can initialize the page structs only *after* that call.
-	   Thus, we're being called twice. */
-	if (early)
-		memory_present(node_id, start_pfn, end_pfn);
-	else {
-		/* As the pages backing SPU LS and I/O are outside the range
-		   of regular memory, their page structs were not initialized
-		   by free_area_init. Do it here instead. */
-		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-			struct page *page = pfn_to_page(pfn);
-			set_page_links(page, ZONE_DMA, node_id, pfn);
-			set_page_count(page, 1);
-			reset_page_mapcount(page);
-			SetPageReserved(page);
-			INIT_LIST_HEAD(&page->lru);
-		}
-	}
-}
-
-void __init cell_spumem_init(int early)
-{
-	struct device_node *node;
-	for (node = of_find_node_by_type(NULL, "spe");
-			node; node = of_find_node_by_type(node, "spe")) {
-		cell_spuprop_present(node, "local-store", early);
-		cell_spuprop_present(node, "problem", early);
-		cell_spuprop_present(node, "priv1", early);
-		cell_spuprop_present(node, "priv2", early);
-	}
-}
-#endif
-
 static void cell_progress(char *s, unsigned short hex)
 {
 	printk("*** %04x : %s\n", hex, s ? s : "");
@@ -201,8 +136,6 @@ static void __init cell_setup_arch(void)
 #endif
 
 	mmio_nvram_init();
-
-	cell_spumem_init(0);
 }
 
 /*
Index: linus-2.6/arch/powerpc/platforms/cell/spu_base.c
===================================================================
--- linus-2.6.orig/arch/powerpc/platforms/cell/spu_base.c
+++ linus-2.6/arch/powerpc/platforms/cell/spu_base.c
@@ -516,6 +516,57 @@ void spu_irq_setaffinity(struct spu *spu
 }
 EXPORT_SYMBOL_GPL(spu_irq_setaffinity);
 
+/* XXX better look for ibm,associativity properties as well */
+static int __init find_spu_node_id(struct device_node *spe)
+{
+	unsigned int *id;
+	struct device_node *cpu;
+	cpu = spe->parent->parent;
+	id = (unsigned int *)get_property(cpu, "node-id", NULL);
+	return id ? *id : 0;
+}
+
+static int __init cell_spuprop_present(struct device_node *spe,
+		const char *prop)
+{
+	static DEFINE_MUTEX(add_spumem_mutex);
+
+	struct address_prop {
+		unsigned long address;
+		unsigned int len;
+	} __attribute__((packed)) *p;
+	int proplen;
+
+	unsigned long start_pfn, nr_pages;
+	int node_id;
+	struct pglist_data *pgdata;
+	struct zone *zone;
+	int ret;
+
+	p = (void*)get_property(spe, prop, &proplen);
+	WARN_ON(proplen != sizeof (*p));
+
+	start_pfn = p->address >> PAGE_SHIFT;
+	nr_pages = ((unsigned long)p->len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	/*
+	 * XXX need to get the correct NUMA node in here. This may
+	 * be different from the spe::node_id property, e.g. when
+	 * the host firmware is not NUMA aware.
+	 */
+	node_id = 0;
+
+	pgdata = NODE_DATA(node_id);
+	zone = pgdata->node_zones;
+
+	/* XXX rethink locking here */
+	mutex_lock(&add_spumem_mutex);
+	ret = __add_pages(zone, start_pfn, nr_pages);
+	mutex_unlock(&add_spumem_mutex);
+
+	return ret;
+}
+
 static void __iomem * __init map_spe_prop(struct device_node *n,
 		 const char *name)
 {
@@ -526,6 +577,8 @@ static void __iomem * __init map_spe_pro
 	void *p;
 	int proplen;
 
+	void* ret = NULL;
+	int err = 0;
 	p = get_property(n, name, &proplen);
 	if (proplen != sizeof (struct address_prop))
 		return NULL;
@@ -537,7 +590,14 @@ static void __iomem * __init map_spe_pro
 	if (strcmp (name, "priv2") == 0 && prop->len < 0x20000)
 		return ioremap(prop->address, 0x20000);
 
-	return ioremap(prop->address, prop->len);
+	err = cell_spuprop_present(n, name);
+	if (err && (err != -EEXIST))
+		goto out;
+
+	ret = ioremap(prop->address, prop->len);
+
+ out:
+	return ret;
 }
 
 static void spu_unmap(struct spu *spu)
@@ -597,17 +657,6 @@ out:
 	return ret;
 }
 
-static int __init find_spu_node_id(struct device_node *spe)
-{
-	unsigned int *id;
-	struct device_node *cpu;
-
-	cpu = spe->parent->parent;
-	id = (unsigned int *)get_property(cpu, "node-id", NULL);
-
-	return id ? *id : 0;
-}
-
 static int __init create_spu(struct device_node *spe)
 {
 	struct spu *spu;
Index: linus-2.6/mm/memory_hotplug.c
===================================================================
--- linus-2.6.orig/mm/memory_hotplug.c
+++ linus-2.6/mm/memory_hotplug.c
@@ -69,12 +69,16 @@ int __add_pages(struct zone *zone, unsig
 
 	for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
 		err = __add_section(zone, phys_start_pfn + i);
 
-		if (err)
+		/* We want to keep adding the rest of the
+		 * sections if the first ones already exist
+		 */
+		if (err && (err != -EEXIST))
 			break;
 	}
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(__add_pages);
 
 static void grow_zone_span(struct zone *zone,
 		unsigned long start_pfn, unsigned long end_pfn)
Index: linus-2.6/mm/sparse.c
===================================================================
--- linus-2.6.orig/mm/sparse.c
+++ linus-2.6/mm/sparse.c
@@ -26,13 +26,16 @@ struct mem_section mem_section[NR_SECTIO
 EXPORT_SYMBOL(mem_section);
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
-static struct mem_section *sparse_index_alloc(int nid)
+static struct mem_section *sparse_index_alloc(int nid, int late)
 {
 	struct mem_section *section = NULL;
 	unsigned long array_size = SECTIONS_PER_ROOT *
 				   sizeof(struct mem_section);
 
-	section = alloc_bootmem_node(NODE_DATA(nid), array_size);
+	if (late)
+		section = kmalloc_node(array_size, GFP_KERNEL, nid);
+	else
+		section = alloc_bootmem_node(NODE_DATA(nid), array_size);
 
 	if (section)
 		memset(section, 0, array_size);
@@ -40,7 +43,7 @@ static struct mem_section *sparse_index_
 	return section;
 }
 
-static int sparse_index_init(unsigned long section_nr, int nid)
+static int sparse_index_init(unsigned long section_nr, int nid, int late)
 {
 	static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED;
 	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
@@ -50,7 +53,7 @@ static int sparse_index_init(unsigned lo
 	if (mem_section[root])
 		return -EEXIST;
 
-	section = sparse_index_alloc(nid);
+	section = sparse_index_alloc(nid, late);
 	/*
 	 * This lock keeps two different sections from
 	 * reallocating for the same index
@@ -68,7 +71,8 @@ out:
 	return ret;
 }
 #else /* !SPARSEMEM_EXTREME */
-static inline int sparse_index_init(unsigned long section_nr, int nid)
+static inline int sparse_index_init(unsigned long section_nr, int nid,
+					int late)
 {
 	return 0;
 }
@@ -109,7 +113,7 @@ void memory_present(int nid, unsigned lo
 		unsigned long section = pfn_to_section_nr(pfn);
 		struct mem_section *ms;
 
-		sparse_index_init(section, nid);
+		sparse_index_init(section, nid, 0);
 		ms = __nr_to_section(section);
 
 		if (!ms->section_mem_map)
@@ -250,8 +254,8 @@ void sparse_init(void)
 
 /*
  * returns the number of sections whose mem_maps were properly
- * set.  If this is <=0, then that means that the passed-in
- * map was not consumed and must be freed.
+ * set.  If the return value is less than 0, the section was not added;
+ * -EEXIST means it was already there.
  */
 int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 			   int nr_pages)
@@ -267,7 +271,7 @@ int sparse_add_one_section(struct zone *
 	 * no locking for this, because it does its own
 	 * plus, it does a kmalloc
	 */
-	sparse_index_init(section_nr, pgdat->node_id);
+	sparse_index_init(section_nr, pgdat->node_id, 1);
 	memmap = __kmalloc_section_memmap(nr_pages);
 
 	pgdat_resize_lock(pgdat, &flags);
@@ -281,9 +285,10 @@ int sparse_add_one_section(struct zone *
 
 	ret = sparse_init_one_section(ms, section_nr, memmap);
 
+out:
 	if (ret <= 0)
 		__kfree_section_memmap(memmap, nr_pages);
-out:
+
 	pgdat_resize_unlock(pgdat, &flags);
 	return ret;
 }
Index: linus-2.6/arch/powerpc/platforms/cell/Kconfig
===================================================================
--- linus-2.6.orig/arch/powerpc/platforms/cell/Kconfig
+++ linus-2.6/arch/powerpc/platforms/cell/Kconfig
@@ -22,7 +22,7 @@ config SPU_FS
 
 config SPUFS_MMAP
 	bool
-	depends on SPU_FS && SPARSEMEM && !PPC_64K_PAGES
+	depends on SPU_FS && SPARSEMEM && MEMORY_HOTPLUG
 	default y
 
 endmenu
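For reference, here is a minimal sketch of the same hot-add call pattern
outside of spufs: convert a physical address range to a pfn range and hand
it to __add_pages() against a zone of the target node.  The helper name
add_device_memory() and its arguments are made up for illustration only;
__add_pages(), NODE_DATA(), PAGE_SHIFT/PAGE_SIZE and the -EEXIST handling
are the interfaces and behaviour the spu_base.c hunk above relies on.

/*
 * Illustrative sketch, not part of the patch.  Mirrors the call
 * pattern of cell_spuprop_present(): add_device_memory() is a
 * hypothetical helper; __add_pages() and NODE_DATA() are the real
 * interfaces used above.
 */
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memory_hotplug.h>

static int add_device_memory(unsigned long addr, unsigned long len, int nid)
{
	unsigned long start_pfn = addr >> PAGE_SHIFT;
	unsigned long nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct zone *zone = NODE_DATA(nid)->node_zones; /* first zone, as above */
	int err;

	/* Creates the mem_section/mem_map entries backing the range;
	 * -EEXIST means (part of) the range is already covered, which
	 * map_spe_prop() above also treats as success. */
	err = __add_pages(zone, start_pfn, nr_pages);
	if (err == -EEXIST)
		err = 0;
	return err;
}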