Subject: spufs: make spu page faults not block scheduling From: Arnd Bergmann Until now, we have always entered the spu page fault handler with a mutex for the spu context held. This has multiple bad side-effects: - it becomes impossible to suspend the context during page faults - if an spu program attempts to access its own mmio areas through DMA, we get an immediate livelock when the nopage function tries to acquire the same mutex This patch makes the page fault logic operate on a struct spu_context instead of a struct spu, and moves it from spu_base.c to a new file fault.c inside of spufs. We now also need to copy the dar and dsisr contents of the last fault into the saved context to have it accessible in case we schedule out the context before activating the page fault handler. Signed-off-by: Arnd Bergmann Index: linux-2.6/arch/powerpc/platforms/cell/spu_base.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spu_base.c +++ linux-2.6/arch/powerpc/platforms/cell/spu_base.c @@ -287,7 +287,6 @@ spu_irq_class_1(int irq, void *data) return stat ? IRQ_HANDLED : IRQ_NONE; } -EXPORT_SYMBOL_GPL(spu_irq_class_1_bottom); static irqreturn_t spu_irq_class_2(int irq, void *data) @@ -459,108 +458,6 @@ void spu_free(struct spu *spu) } EXPORT_SYMBOL_GPL(spu_free); -static int spu_handle_mm_fault(struct spu *spu) -{ - struct mm_struct *mm = spu->mm; - struct vm_area_struct *vma; - u64 ea, dsisr, is_write; - int ret; - - ea = spu->dar; - dsisr = spu->dsisr; -#if 0 - if (!IS_VALID_EA(ea)) { - return -EFAULT; - } -#endif /* XXX */ - if (mm == NULL) { - return -EFAULT; - } - if (mm->pgd == NULL) { - return -EFAULT; - } - - down_read(&mm->mmap_sem); - vma = find_vma(mm, ea); - if (!vma) - goto bad_area; - if (vma->vm_start <= ea) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; -#if 0 - if (expand_stack(vma, ea)) - goto bad_area; -#endif /* XXX */ -good_area: - is_write = dsisr & MFC_DSISR_ACCESS_PUT; - if (is_write) { - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - } else { - if (dsisr & MFC_DSISR_ACCESS_DENIED) - goto bad_area; - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) - goto bad_area; - } - ret = 0; - switch (handle_mm_fault(mm, vma, ea, is_write)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - ret = -EFAULT; - goto bad_area; - case VM_FAULT_OOM: - ret = -ENOMEM; - goto bad_area; - default: - BUG(); - } - up_read(&mm->mmap_sem); - return ret; - -bad_area: - up_read(&mm->mmap_sem); - return -EFAULT; -} - -int spu_irq_class_1_bottom(struct spu *spu) -{ - u64 ea, dsisr, access, error = 0UL; - int ret = 0; - - ea = spu->dar; - dsisr = spu->dsisr; - if (dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED)) { - u64 flags; - - access = (_PAGE_PRESENT | _PAGE_USER); - access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_RW : 0UL; - local_irq_save(flags); - if (hash_page(ea, access, 0x300) != 0) - error |= CLASS1_ENABLE_STORAGE_FAULT_INTR; - local_irq_restore(flags); - } - if (error & CLASS1_ENABLE_STORAGE_FAULT_INTR) { - if ((ret = spu_handle_mm_fault(spu)) != 0) - error |= CLASS1_ENABLE_STORAGE_FAULT_INTR; - else - error &= ~CLASS1_ENABLE_STORAGE_FAULT_INTR; - } - spu->dar = 0UL; - spu->dsisr = 0UL; - if (!error) { - spu_restart_dma(spu); - } else { - spu->dma_callback(spu, SPE_EVENT_SPE_DATA_STORAGE); - } - return ret; -} - struct sysdev_class spu_sysdev_class = { set_kset_name("spu") }; Index: linux-2.6/arch/powerpc/platforms/cell/spufs/backing_ops.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/backing_ops.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/backing_ops.c @@ -350,6 +350,11 @@ static int spu_backing_send_mfc_command( return ret; } +static void spu_backing_restart_dma(struct spu_context *ctx) +{ + /* nothing to do here */ +} + struct spu_context_ops spu_backing_ops = { .mbox_read = spu_backing_mbox_read, .mbox_stat_read = spu_backing_mbox_stat_read, @@ -376,4 +381,5 @@ struct spu_context_ops spu_backing_ops = .read_mfc_tagstatus = spu_backing_read_mfc_tagstatus, .get_mfc_free_elements = spu_backing_get_mfc_free_elements, .send_mfc_command = spu_backing_send_mfc_command, + .restart_dma = spu_backing_restart_dma, }; Index: linux-2.6/arch/powerpc/platforms/cell/spufs/hw_ops.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/hw_ops.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/hw_ops.c @@ -296,6 +296,14 @@ static int spu_hw_send_mfc_command(struc } } +static void spu_hw_restart_dma(struct spu_context *ctx) +{ + struct spu_priv2 __iomem *priv2 = ctx->spu->priv2; + + if (!test_bit(SPU_CONTEXT_SWITCH_PENDING, &ctx->spu->flags)) + out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND); +} + struct spu_context_ops spu_hw_ops = { .mbox_read = spu_hw_mbox_read, .mbox_stat_read = spu_hw_mbox_stat_read, @@ -320,4 +328,5 @@ struct spu_context_ops spu_hw_ops = { .read_mfc_tagstatus = spu_hw_read_mfc_tagstatus, .get_mfc_free_elements = spu_hw_get_mfc_free_elements, .send_mfc_command = spu_hw_send_mfc_command, + .restart_dma = spu_hw_restart_dma, }; Index: linux-2.6/arch/powerpc/platforms/cell/spufs/run.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/run.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/run.c @@ -18,27 +18,6 @@ void spufs_stop_callback(struct spu *spu wake_up_all(&ctx->stop_wq); } -void spufs_dma_callback(struct spu *spu, int type) -{ - struct spu_context *ctx = spu->ctx; - - if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) { - ctx->event_return |= type; - wake_up_all(&ctx->stop_wq); - } else { - switch (type) { - case SPE_EVENT_DMA_ALIGNMENT: - case SPE_EVENT_SPE_DATA_STORAGE: - case SPE_EVENT_INVALID_DMA: - force_sig(SIGBUS, /* info, */ current); - break; - case SPE_EVENT_SPE_ERROR: - force_sig(SIGILL, /* info */ current); - break; - } - } -} - static inline int spu_stopped(struct spu_context *ctx, u32 * stat) { struct spu *spu; @@ -292,11 +271,8 @@ int spu_process_callback(struct spu_cont static inline int spu_process_events(struct spu_context *ctx) { struct spu *spu = ctx->spu; - u64 pte_fault = MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED; int ret = 0; - if (spu->dsisr & pte_fault) - ret = spu_irq_class_1_bottom(spu); if (spu->class_0_pending) ret = spu_irq_class_0_bottom(spu); if (!ret && signal_pending(current)) @@ -330,6 +306,10 @@ long spufs_run_spu(struct file *file, st break; status &= ~SPU_STATUS_STOPPED_BY_STOP; } + ret = spufs_handle_class1(ctx); + if (ret) + break; + if (unlikely(ctx->state != SPU_STATE_RUNNABLE)) { ret = spu_reacquire_runnable(ctx, npc, &status); if (ret) { Index: linux-2.6/arch/powerpc/platforms/cell/spufs/spufs.h =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/spufs.h +++ linux-2.6/arch/powerpc/platforms/cell/spufs/spufs.h @@ -141,6 +141,7 @@ struct spu_context_ops { struct spu_dma_info * info); void (*proxydma_info_read) (struct spu_context * ctx, struct spu_proxydma_info * info); + void (*restart_dma)(struct spu_context *ctx); }; extern struct spu_context_ops spu_hw_ops; @@ -172,6 +173,9 @@ int put_spu_gang(struct spu_gang *gang); void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx); void spu_gang_add_ctx(struct spu_gang *gang, struct spu_context *ctx); +/* fault handling */ +int spufs_handle_class1(struct spu_context *ctx); + /* context management */ static inline void spu_acquire(struct spu_context *ctx) { Index: linux-2.6/arch/powerpc/platforms/cell/spufs/switch.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/switch.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/switch.c @@ -2084,6 +2084,10 @@ int spu_save(struct spu_state *prev, str int rc; acquire_spu_lock(spu); /* Step 1. */ + prev->dar = spu->dar; + prev->dsisr = spu->dsisr; + spu->dar = 0; + spu->dsisr = 0; rc = __do_spu_save(prev, spu); /* Steps 2-53. */ release_spu_lock(spu); if (rc != 0 && rc != 2 && rc != 6) { @@ -2109,9 +2113,9 @@ int spu_restore(struct spu_state *new, s acquire_spu_lock(spu); harvest(NULL, spu); - spu->dar = 0; - spu->dsisr = 0; spu->slb_replace = 0; + new->dar = 0; + new->dsisr = 0; spu->class_0_pending = 0; rc = __do_spu_restore(new, spu); release_spu_lock(spu); Index: linux-2.6/include/asm-powerpc/spu_csa.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/spu_csa.h +++ linux-2.6/include/asm-powerpc/spu_csa.h @@ -242,6 +242,7 @@ struct spu_state { u64 spu_chnldata_RW[32]; u32 spu_mailbox_data[4]; u32 pu_mailbox_data[1]; + u64 dar, dsisr; unsigned long suspend_time; spinlock_t register_lock; }; Index: linux-2.6/arch/powerpc/platforms/cell/spufs/Makefile =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/Makefile +++ linux-2.6/arch/powerpc/platforms/cell/spufs/Makefile @@ -3,6 +3,7 @@ obj-y += switch.o obj-$(CONFIG_SPU_FS) += spufs.o spufs-y += inode.o file.o context.o syscalls.o coredump.o spufs-y += sched.o backing_ops.o hw_ops.o run.o gang.o +spufs-y += fault.o # Rules to build switch.o with the help of SPU tool chain SPU_CROSS := spu- Index: linux-2.6/include/asm-powerpc/mmu.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/mmu.h +++ linux-2.6/include/asm-powerpc/mmu.h @@ -234,6 +234,7 @@ extern int __hash_page_64K(unsigned long unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned int local); struct mm_struct; +extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap); extern int hash_huge_page(struct mm_struct *mm, unsigned long access, unsigned long ea, unsigned long vsid, int local, unsigned long trap); Index: linux-2.6/arch/powerpc/platforms/cell/spufs/fault.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/platforms/cell/spufs/fault.c @@ -0,0 +1,192 @@ +/* + * Low-level SPU handling + * + * (C) Copyright IBM Deutschland Entwicklung GmbH 2005 + * + * Author: Arnd Bergmann + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include + +#include +#include + +#include "spufs.h" + +/* + * This ought to be kept in sync with the powerpc specific do_page_fault + * function. Currently, there are a few corner cases that we haven't had + * to handle fortunately. + */ +static int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea, unsigned long dsisr) +{ + struct vm_area_struct *vma; + unsigned long is_write; + int ret; + +#if 0 + if (!IS_VALID_EA(ea)) { + return -EFAULT; + } +#endif /* XXX */ + if (mm == NULL) { + return -EFAULT; + } + if (mm->pgd == NULL) { + return -EFAULT; + } + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ea); + if (!vma) + goto bad_area; + if (vma->vm_start <= ea) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; +#if 0 /* needs to be exported first */ + if (expand_stack(vma, ea)) + goto bad_area; +#endif +good_area: + is_write = dsisr & MFC_DSISR_ACCESS_PUT; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + } else { + if (dsisr & MFC_DSISR_ACCESS_DENIED) + goto bad_area; + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; + } + ret = 0; + switch (handle_mm_fault(mm, vma, ea, is_write)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + case VM_FAULT_SIGBUS: + ret = -EFAULT; + goto bad_area; + case VM_FAULT_OOM: + ret = -ENOMEM; + goto bad_area; + default: + BUG(); + } + up_read(&mm->mmap_sem); + return ret; + +bad_area: + up_read(&mm->mmap_sem); + return -EFAULT; +} + +static void spufs_handle_dma_error(struct spu_context *ctx, int type) +{ + if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) { + ctx->event_return |= type; + wake_up_all(&ctx->stop_wq); + } else { + switch (type) { + case SPE_EVENT_DMA_ALIGNMENT: + case SPE_EVENT_SPE_DATA_STORAGE: + case SPE_EVENT_INVALID_DMA: + force_sig(SIGBUS, /* info, */ current); + break; + case SPE_EVENT_SPE_ERROR: + force_sig(SIGILL, /* info */ current); + break; + } + } +} + +void spufs_dma_callback(struct spu *spu, int type) +{ + spufs_handle_dma_error(spu->ctx, type); +} + +/* + * bottom half handler for page faults, we can't do this from + * interrupt context, since we might need to sleep. + * we also need to give up the mutex so we can get scheduled + * out while waiting for the backing store. + * + * TODO: try calling hash_page from the interrupt handler first + * in order to speed up the easy case. + */ +int spufs_handle_class1(struct spu_context *ctx) +{ + u64 ea, dsisr, access; + unsigned long flags; + int ret; + + /* + * dar and dsisr get passed from the registers + * to the spu_context, to this function, but not + * back to the spu if it gets scheduled again. + * + * if we don't handle the fault for a saved context + * in time, we can still expect to get the same fault + * the immediately after the context restore. + */ + if (ctx->state == SPU_STATE_RUNNABLE) { + ea = ctx->spu->dar; + dsisr = ctx->spu->dsisr; + ctx->spu->dar= ctx->spu->dsisr = 0; + } else { + ea = ctx->csa.priv1.mfc_dar_RW; + dsisr = ctx->csa.priv1.mfc_dsisr_RW; + ctx->csa.priv1.mfc_dar_RW = 0; + ctx->csa.priv1.mfc_dsisr_RW = 0; + } + + if (!(dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED))) + return 0; + + pr_debug("ctx %p: ea %016lx, dsisr %016lx state %d\n", ctx, ea, + dsisr, ctx->state); + + /* we must not hold the lock when entering spu_handle_mm_fault */ + spu_release(ctx); + + access = (_PAGE_PRESENT | _PAGE_USER); + access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_RW : 0UL; + local_irq_save(flags); + ret = hash_page(ea, access, 0x300); + local_irq_restore(flags); + + /* hashing failed, so try the actual fault handler */ + if (ret) + ret = spu_handle_mm_fault(current->mm, ea, dsisr); + + spu_acquire(ctx); + /* + * If we handled the fault successfully and are in runnable + * state, restart the DMA. + * In case of unhandled error report the problem to user space. + */ + if (!ret) { + if (ctx->spu) + ctx->ops->restart_dma(ctx); + } else + spufs_handle_dma_error(ctx, SPE_EVENT_SPE_DATA_STORAGE); + + return ret; +}