From: linas@austin.ibm.com (Linas Vepstas) The current PCI error recovery system keeps track of the number of PCI card resets, and refuses to bring a card back up if this number is too large. The goal of doing this was to avoid an infinite loop of resets if a card is obviously dead. However, if the failures are rare, but the machine has a high uptime, this mechanism might still be triggered; this is too harsh. This patch avoids this problem by decrementing the fail count after an hour. Thus, as long as a PCI card BSOD's less than 6 times an hour, it will continue to be reset indefinitely. If its failure rate is greater than that, it will be taken off-line permanently. This patch is larger than it might otherwise be because it changes indentation by removing a pointless while-loop. The while loop is not needed, as the handler is invoked once for each event (by schedule_work()); the loop is leftover cruft from an earlier implementation. Signed-off-by: Linas Vepstas Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/eeh_driver.c | 13 ++- arch/powerpc/platforms/pseries/eeh_event.c | 60 +++++++++--------- include/asm-powerpc/eeh_event.h | 10 +-- 3 files changed, 45 insertions(+), 38 deletions(-) diff -puN arch/powerpc/platforms/pseries/eeh_driver.c~powerpc-pseries-clear-pci-failure-counter-if-no-new-failures arch/powerpc/platforms/pseries/eeh_driver.c --- devel/arch/powerpc/platforms/pseries/eeh_driver.c~powerpc-pseries-clear-pci-failure-counter-if-no-new-failures 2006-04-07 21:52:25.000000000 -0700 +++ devel-akpm/arch/powerpc/platforms/pseries/eeh_driver.c 2006-04-07 21:52:25.000000000 -0700 @@ -23,9 +23,8 @@ * */ #include -#include #include -#include +#include #include #include #include @@ -250,7 +249,7 @@ static int eeh_reset_device (struct pci_ */ #define MAX_WAIT_FOR_RECOVERY 15 -void handle_eeh_events (struct eeh_event *event) +struct pci_dn * handle_eeh_events (struct eeh_event *event) { struct device_node *frozen_dn; struct pci_dn *frozen_pdn; 
@@ -265,7 +264,7 @@ void handle_eeh_events (struct eeh_event if (!frozen_dn) { printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n", pci_name(event->dev)); - return; + return NULL; } /* There are two different styles for coming up with the PE. @@ -280,7 +279,7 @@ void handle_eeh_events (struct eeh_event if (!frozen_bus) { printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n", frozen_dn->full_name); - return; + return NULL; } #if 0 @@ -355,7 +354,7 @@ void handle_eeh_events (struct eeh_event /* Tell all device drivers that they can resume operations */ pci_walk_bus(frozen_bus, eeh_report_resume, NULL); - return; + return frozen_pdn; excess_failures: /* @@ -384,6 +383,8 @@ perm_error: /* Shut down the device drivers for good. */ pcibios_remove_pci_devices(frozen_bus); + + return NULL; } /* ---------- end of file ---------- */ diff -puN arch/powerpc/platforms/pseries/eeh_event.c~powerpc-pseries-clear-pci-failure-counter-if-no-new-failures arch/powerpc/platforms/pseries/eeh_event.c --- devel/arch/powerpc/platforms/pseries/eeh_event.c~powerpc-pseries-clear-pci-failure-counter-if-no-new-failures 2006-04-07 21:52:25.000000000 -0700 +++ devel-akpm/arch/powerpc/platforms/pseries/eeh_event.c 2006-04-07 21:52:25.000000000 -0700 @@ -18,6 +18,7 @@ * Copyright (c) 2005 Linas Vepstas */ +#include #include #include #include @@ -56,38 +57,43 @@ static int eeh_event_handler(void * dumm { unsigned long flags; struct eeh_event *event; + struct pci_dn *pdn; daemonize ("eehd"); + set_current_state(TASK_INTERRUPTIBLE); - while (1) { - set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; + + /* Unqueue the event, get ready to process. 
*/ + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - spin_lock_irqsave(&eeh_eventlist_lock, flags); - event = NULL; + if (event == NULL) + return 0; - /* Unqueue the event, get ready to process. */ - if (!list_empty(&eeh_eventlist)) { - event = list_entry(eeh_eventlist.next, struct eeh_event, list); - list_del(&event->list); - } - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - - if (event == NULL) - break; - - /* Serialize processing of EEH events */ - mutex_lock(&eeh_event_mutex); - eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); - - printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", - pci_name(event->dev)); - - handle_eeh_events(event); - - eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); - pci_dev_put(event->dev); - kfree(event); - mutex_unlock(&eeh_event_mutex); + /* Serialize processing of EEH events */ + mutex_lock(&eeh_event_mutex); + eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); + + printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", + pci_name(event->dev)); + + pdn = handle_eeh_events(event); + + eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); + pci_dev_put(event->dev); + kfree(event); + mutex_unlock(&eeh_event_mutex); + + /* If there are no new errors after an hour, clear the counter. 
*/ + if (pdn && pdn->eeh_freeze_count>0) { + msleep_interruptible (3600*1000); + if (pdn->eeh_freeze_count>0) + pdn->eeh_freeze_count--; } return 0; diff -puN include/asm-powerpc/eeh_event.h~powerpc-pseries-clear-pci-failure-counter-if-no-new-failures include/asm-powerpc/eeh_event.h --- devel/include/asm-powerpc/eeh_event.h~powerpc-pseries-clear-pci-failure-counter-if-no-new-failures 2006-04-07 21:52:25.000000000 -0700 +++ devel-akpm/include/asm-powerpc/eeh_event.h 2006-04-07 21:52:25.000000000 -0700 @@ -18,8 +18,8 @@ * Copyright (c) 2005 Linas Vepstas */ -#ifndef ASM_PPC64_EEH_EVENT_H -#define ASM_PPC64_EEH_EVENT_H +#ifndef ASM_POWERPC_EEH_EVENT_H +#define ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ /** EEH event -- structure holding pci controller data that describes @@ -39,7 +39,7 @@ struct eeh_event { * @dev pci device * * This routine builds a PCI error event which will be delivered - * to all listeners on the peh_notifier_chain. + * to all listeners on the eeh_notifier_chain. * * This routine can be called within an interrupt context; * the actual event will be delivered in a normal context @@ -51,7 +51,7 @@ int eeh_send_failure_event (struct devic int time_unavail); /* Main recovery function */ -void handle_eeh_events (struct eeh_event *); +struct pci_dn * handle_eeh_events (struct eeh_event *); #endif /* __KERNEL__ */ -#endif /* ASM_PPC64_EEH_EVENT_H */ +#endif /* ASM_POWERPC_EEH_EVENT_H */ _