Actual source code: xmllogevent.c

petsc-3.8.3 2017-12-09
Report Typos and Errors
  1: /*************************************************************************************
  2:  *    M A R I T I M E  R E S E A R C H  I N S T I T U T E  N E T H E R L A N D S     *
  3:  *************************************************************************************
  4:  *    authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij                    *
  5:  *************************************************************************************
  6:  *    content: Support for nested PetscTimers                                        *
  7:  *************************************************************************************/
  8:  #include <petsclog.h>
  9:  #include <petsc/private/logimpl.h>
 10:  #include <petsctime.h>
 11:  #include <petscviewer.h>
 12: #include "../src/sys/logging/xmllogevent.h"
 13: #include "../src/sys/logging/xmlviewer.h"

 15: #if defined(PETSC_USE_LOG)

 17: /*
 18:  * Support for nested PetscTimers
 19:  *
 20:  * PetscTimers keep track of a lot of useful information: Wall clock times, 
 21:  * message passing statistics, flop counts.  Information about the nested structure 
 22:  * of the timers is lost. Example: 
 23:  *
 24:  * 7:30   Start: awake
 25:  * 7:30      Start: morning routine
 26:  * 7:40         Start: eat
 27:  * 7:49         Done:  eat
 28:  * 7:43      Done:  morning routine
 29:  * 8:15      Start: work
 30:  * 12:15        Start: eat
 31:  * 12:45        Done:  eat
 32:  * 16:00     Done:  work
 33:  * 16:30     Start: evening routine
 34:  * 18:30        Start: eat
 35:  * 19:15        Done:  eat
 36:  * 22:00     Done:  evening routine
 37:  * 22:00  Done:  awake
 38:  *
 39:  * Petsc timers provide the following timer results:
 40:  *
 41:  *    awake:              1 call    14:30 hours
 42:  *    morning routine:    1 call     0:13 hours
 43:  *    eat:                3 calls    1:24 hours
 44:  *    work:               1 call     7:45 hours
 45:  *    evening routine     1 call     5:30 hours
 46:  *
 47:  * Nested timers can be used to get the following table:
 48:  *
 49:  *   [1 call]: awake                14:30 hours
 50:  *   [1 call]:    morning routine         0:13 hours         ( 2 % of awake)
 51:  *   [1 call]:       eat                       0:09 hours         (69 % of morning routine)
 52:  *                   rest (morning routine)    0:04 hours         (31 % of morning routine)
 53:  *   [1 call]:    work                    7:45 hours         (53 % of awake)
 54:  *   [1 call]:       eat                       0:30 hours         ( 6 % of work)
 55:  *                   rest (work)               7:15 hours         (94 % of work)
 56:  *   [1 call]:    evening routine         5:30 hours         (38 % of awake)
 57:  *   [1 call]:       eat                       0:45 hours         (14 % of evening routine)
 58:  *                   rest (evening routine)    4:45 hours         (86 % of morning routine)
 59:  *
 60:  * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least, 
 61:  * the nested timers make the stages unnecessary.
 62:  *
 63:  */

 65: /*
 66:  * Data structures for keeping track of nested timers:
 67:  *
 68:  *   nestedEvents: information about the timers that have actually been activated
 69:  *   dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
 70:  *
 71:  * The Default-timers are used to time the nested timers. Every nested timer corresponds to 
 72:  * (one or more) default timers, where one of the default timers has the same event-id as the 
 73:  * nested one.
 74:  *
 75:  * Because of the risk of confusion between nested timer ids and default timer ids, we 
 76:  * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent 
 77:  * only for default events. Also, all nested event variables are prepended with 'nst', and
 78:  * default timers with 'dft'.
 79:  */

 81: #define DFT_ID_AWAKE -1

 83: typedef PetscLogEvent NestedEventId;
 84: typedef struct {
 85:   NestedEventId   nstEvent;         /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
 86:   int             nParents;         /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
 87:   PetscLogEvent  *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
 88:   PetscLogEvent  *dftEvents;        /* The default timers which represent the different 'instances' of this nested event */

 90:   PetscLogEvent  *dftParents;       /* The default timers which were the dftParentActive when this nested event was started */
 91:   PetscLogEvent  *dftEventsSorted;  /* The default timers which represent the different 'instances' of this nested event */
 92: } PetscNestedEvent;

 94: static PetscLogEvent    dftParentActive  = DFT_ID_AWAKE;
 95: static int              nNestedEvents           = 0;
 96: static int              nNestedEventsAllocated  = 0;
 97: static PetscNestedEvent *nestedEvents = NULL;
 98: static PetscLogDouble   threshTime      = 0.01; /* initial value was .1 */

100: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
101: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);

103: PetscErrorCode PetscLogNestedBegin(void)
104: {
105:   PetscErrorCode    ierr;
107:   if (nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents already allocated");

109:   nNestedEventsAllocated=10;
110:   PetscMalloc1(nNestedEventsAllocated,&nestedEvents);


113:   dftParentActive = DFT_ID_AWAKE;
114:   nNestedEvents =1;

116:   /* 'Awake' is nested event 0. It has no parents */
117:   nestedEvents[0].nstEvent          = 0;
118:   nestedEvents[0].nParents          = 0;
119:   nestedEvents[0].dftParentsSorted  = NULL;
120:   nestedEvents[0].dftEvents         = NULL;
121:   nestedEvents[0].dftParents        = NULL;
122:   nestedEvents[0].dftEventsSorted   = NULL;

124:   PetscLogSet(PetscLogEventBeginNested, PetscLogEventEndNested);
125:   return(0);
126: }

128: /* Delete the data structures for the nested timers */
129: PetscErrorCode PetscLogNestedEnd(void)
130: {
131:   PetscErrorCode    ierr;
132:   int               i;

135:   if (!nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents does not exist");

137:   for (i=0; i<nNestedEvents; i++) {
138:     PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
139:   }
140:   PetscFree(nestedEvents);
141:   nestedEvents           = NULL;
142:   nNestedEvents          = 0;
143:   nNestedEventsAllocated = 0;
144:   return(0);
145: }


148: /*
149:  * UTILITIES: FIND STUFF IN SORTED ARRAYS
150:  *
151:  * Utility: find a default timer in a sorted array */
152: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex,    /* index to be found */
153:                                                     const PetscLogEvent *dftArray,  /* sorted array of PetscLogEvent-ids */
154:                                                     int narray,   /* dimension of dftArray */
155:                                                     int *entry)         /* entry in the array where dftIndex may be found; 
156:                                                                             *   if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray  
157:                                                                             *   In that case, the dftIndex can be inserted at this entry. */
158: {
160:   if (narray==0 || dftIndex <= dftArray[0]) {
161:     *entry = 0;
162:   } else if (dftIndex > dftArray[narray-1]) {
163:     *entry = narray;
164:   } else {
165:     int ihigh=narray-1,  ilow=0;
166:     while (ihigh>ilow) {
167:       const int imiddle = (ihigh+ilow)/2;
168:       if (dftArray[imiddle] > dftIndex) {
169:         ihigh=imiddle;
170:       } else if (dftArray[imiddle]<dftIndex) {
171:         ilow =imiddle+1;
172:       } else {
173:         ihigh=imiddle;
174:         ilow =imiddle;
175:       }
176:     }
177:     *entry = ihigh;
178:   }
179:   return(0);
180: }

182: /* Utility: find the nested event with given identification */
183: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent, /* Nested event to be found */
184:                                                    int *entry)          /* entry in the nestedEvents where nstEvent may be found;
185:                                                                               if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray  */
186: {

189:   if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
190:     *entry = 0;
191:   } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
192:     *entry = nNestedEvents;
193:   } else {
194:     int ihigh=nNestedEvents-1,  ilow=0;
195:     while (ihigh>ilow) {
196:       const int imiddle = (ihigh+ilow)/2;
197:       if (nestedEvents[imiddle].nstEvent > nstEvent) {
198:         ihigh=imiddle;
199:       } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
200:         ilow =imiddle+1;
201:       } else {
202:         ihigh=imiddle;
203:         ilow =imiddle;
204:       }
205:     }
206:     *entry = ihigh;
207:   }
208:   return(0);
209: }

211: /******************************************************************************************/
212: /* Start a nested event */
213: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
214: {
215:   PetscErrorCode  ierr;
216:   int             entry, pentry, tentry,i;
217:   PetscLogEvent   dftEvent;

220:   PetscLogEventFindNestedTimer(nstEvent, &entry);
221:   if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
222:     /* Nested event doesn't exist yet: create it */

224:     if (nNestedEvents==nNestedEventsAllocated) {
225:       /* Enlarge and re-allocate nestedEvents if needed */
226:       PetscNestedEvent *tmp = nestedEvents;
227:       PetscMalloc1(2*nNestedEvents,&nestedEvents);
228:       nNestedEventsAllocated*=2;
229:       PetscMemcpy(nestedEvents, tmp, nNestedEvents*sizeof(PetscNestedEvent));
230:       PetscFree(tmp);
231:     }

233:     /* Clear space in nestedEvents for new nested event */
234:     nNestedEvents++;
235:     for (i = nNestedEvents-1; i>entry; i--) {
236:       nestedEvents[i] = nestedEvents[i-1];
237:     }

239:     /* Create event in nestedEvents */
240:     nestedEvents[entry].nstEvent = nstEvent;
241:     nestedEvents[entry].nParents=1;
242:     PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);

244:     /* Fill in new event */
245:     pentry = 0;
246:     dftEvent = (PetscLogEvent) nstEvent;

248:     nestedEvents[entry].nstEvent                 = nstEvent;
249:     nestedEvents[entry].dftParents[pentry]       = dftParentActive;
250:     nestedEvents[entry].dftEvents[pentry]        = dftEvent;
251:     nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
252:     nestedEvents[entry].dftEventsSorted[pentry]  = dftEvent;

254:   } else {
255:     /* Nested event exists: find current dftParentActive among parents */
256:     PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
257:     PetscLogEvent *dftEvents        = nestedEvents[entry].dftEvents;
258:     int           nParents          = nestedEvents[entry].nParents;

260:     PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);

262:     if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
263:       /* dftParentActive not in the list: add it to the list */
264:       int           i;
265:       PetscLogEvent *dftParents      = nestedEvents[entry].dftParents;
266:       PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
267:       char          name[100];

269:       /* Register a new default timer */
270:       sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
271:       PetscLogEventRegister(name, 0, &dftEvent);
272:       PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);

274:       /* Reallocate parents and dftEvents to make space for new parent */
275:       PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
276:       PetscMemcpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents*sizeof(PetscLogEvent));
277:       PetscMemcpy(nestedEvents[entry].dftEventsSorted,  dftEventsSorted,  nParents*sizeof(PetscLogEvent));
278:       PetscMemcpy(nestedEvents[entry].dftParents,       dftParents,       nParents*sizeof(PetscLogEvent));
279:       PetscMemcpy(nestedEvents[entry].dftEvents,        dftEvents,        nParents*sizeof(PetscLogEvent));
280:       PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);

282:       dftParents       = nestedEvents[entry].dftParents;
283:       dftEvents        = nestedEvents[entry].dftEvents;
284:       dftParentsSorted = nestedEvents[entry].dftParentsSorted;
285:       dftEventsSorted  = nestedEvents[entry].dftEventsSorted;

287:       nestedEvents[entry].nParents++;
288:       nParents++;

290:       for (i = nParents-1; i>pentry; i--) {
291:         dftParentsSorted[i] = dftParentsSorted[i-1];
292:         dftEvents[i]        = dftEvents[i-1];
293:       }
294:       for (i = nParents-1; i>tentry; i--) {
295:         dftParents[i]      = dftParents[i-1];
296:         dftEventsSorted[i] = dftEventsSorted[i-1];
297:       }

299:       /* Fill in the new default timer */
300:       dftParentsSorted[pentry] = dftParentActive;
301:       dftEvents[pentry]        = dftEvent;
302:       dftParents[tentry]       = dftParentActive;
303:       dftEventsSorted[tentry]  = dftEvent;

305:     } else {
306:       /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
307:       dftEvent = nestedEvents[entry].dftEvents[pentry];
308:     }
309:   }

311:   /* Start the default 'dftEvent'-timer and update the dftParentActive */
312:   PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
313:   dftParentActive = dftEvent;
314:   return(0);
315: }

317: /* End a nested event */
318: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
319: {
320:   PetscErrorCode  ierr;
321:   int             entry, pentry, nParents;
322:   PetscLogEvent  *dftEventsSorted;

325:   /* Find the nested event */
326:   PetscLogEventFindNestedTimer(nstEvent, &entry);
327:   if (entry>=nNestedEvents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d larger than number of events %d",entry,nNestedEvents);
328:   if (nestedEvents[entry].nstEvent != nstEvent) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d had unbalanced begin/end pairs does not match %d",entry,nstEvent);
329:   dftEventsSorted = nestedEvents[entry].dftEventsSorted;
330:   nParents        = nestedEvents[entry].nParents;

332:   /* Find the current default timer among the 'dftEvents' of this event */
333:   PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);

335:   if (pentry>=nParents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Entry %d is larger than number of parents %d",pentry,nParents);
336:   if (dftEventsSorted[pentry] != dftParentActive) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Active parent is %d, but we seem to be closing %d",dftParentActive,dftEventsSorted[pentry]);

338:   /* Stop the default timer and update the dftParentActive */
339:   PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
340:   dftParentActive = nestedEvents[entry].dftParents[pentry];
341:   return(0);
342: }

344: /* Set the threshold time for logging the events 
345:  */
346: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
347: {
349:   *oldThresh = threshTime;
350:   threshTime = newThresh;
351:   return(0);
352: }

354: static PetscErrorCode  PetscPrintExeSpecs(PetscViewer viewer)
355: {
356:   PetscErrorCode     ierr;
357:   char               arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
358:   char               version[256], buildoptions[128];
359:   PetscMPIInt        size;
360:   MPI_Comm           comm;
361:   size_t             len;
362: 
364:   PetscObjectGetComm((PetscObject)viewer,&comm);
365:   PetscGetArchType(arch,sizeof(arch));
366:   PetscGetHostName(hostname,sizeof(hostname));
367:   PetscGetUserName(username,sizeof(username));
368:   PetscGetProgramName(pname,sizeof(pname));
369:   PetscGetDate(date,sizeof(date));
370:   PetscGetVersion(version,sizeof(version));
371:   MPI_Comm_size(comm, &size);

373:   PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
374:   PetscViewerXMLPutString(   viewer, "executable"  , "Executable"   , pname );
375:   PetscViewerXMLPutString(   viewer, "architecture", "Architecture" , arch );
376:   PetscViewerXMLPutString(   viewer, "hostname"    , "Host"         , hostname);
377:   PetscViewerXMLPutInt(      viewer, "nprocesses"  , "Number of processes", size );
378:   PetscViewerXMLPutString(   viewer, "user"        , "Run by user"  , username);
379:   PetscViewerXMLPutString(   viewer, "date"        , "Started at"   , date);
380:   PetscViewerXMLPutString(   viewer, "petscrelease", "Petsc Release", version);
381: #if defined(PETSC_USE_DEBUG)
382:   sprintf(buildoptions, "Debug");
383: #else
384:   buildoptions[0] = 0;
385: #endif
386:   PetscStrlen(buildoptions,&len);
387:   if (len) {
388:     PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
389:   }
390:   PetscViewerXMLEndSection(viewer, "runspecification");
391:   return(0);
392: }

394: /* Print the global performance: max, max/min, average and total of 
395:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
396:  */
397: static PetscErrorCode  PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble max, PetscLogDouble ratio, PetscLogDouble avg, PetscLogDouble tot)
398: {

402:   PetscViewerXMLStartSection(viewer, name, desc);
403:   PetscViewerXMLPutDouble(viewer, "max", NULL, max, "%e");
404:   PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
405:   if (avg>-1.0) {
406:     PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
407:   }
408:   if (tot>-1.0) {
409:     PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
410:   }
411:   PetscViewerXMLEndSection(viewer, name);
412:   return(0);
413: }

415: /* Print the global performance: max, max/min, average and total of 
416:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
417:  */
418: static PetscErrorCode  PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
419: {
420:   PetscErrorCode     ierr;
421:   PetscLogDouble     min, max, tot, ratio, avg;
422:   PetscLogDouble     flops, mem, red, mess;
423:   PetscMPIInt        size;
424:   MPI_Comm           comm;

427:   PetscObjectGetComm((PetscObject)viewer,&comm);
428:   MPI_Comm_size(comm, &size);

430:   /* Must preserve reduction count before we go on */
431:   red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;

433:   /* Calculate summary information */
434:   PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");

436:   /*   Time */
437:   MPIU_Allreduce(&locTotalTime, &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
438:   MPIU_Allreduce(&locTotalTime, &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
439:   MPIU_Allreduce(&locTotalTime, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
440:   avg  = (tot)/((PetscLogDouble) size);
441:   if (min != 0.0) ratio = max/min;
442:   else ratio = 0.0;
443:   PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", max, ratio, avg, -1.0);

445:   /*   Objects */
446:   avg  = (PetscLogDouble) petsc_numObjects;
447:   MPIU_Allreduce(&avg,          &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
448:   MPIU_Allreduce(&avg,          &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
449:   MPIU_Allreduce(&avg,          &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
450:   avg  = (tot)/((PetscLogDouble) size);
451:   if (min != 0.0) ratio = max/min;
452:   else ratio = 0.0;
453:   PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", max, ratio, avg, -1.0);

455:   /*   Flop */
456:   MPIU_Allreduce(&petsc_TotalFlops,  &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
457:   MPIU_Allreduce(&petsc_TotalFlops,  &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
458:   MPIU_Allreduce(&petsc_TotalFlops,  &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
459:   avg  = (tot)/((PetscLogDouble) size);
460:   if (min != 0.0) ratio = max/min;
461:   else ratio = 0.0;
462:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", max/1.0E6, ratio, avg/1.0E6, tot/1.0E6);

464:   /*   Flop/sec -- Must talk to Barry here */
465:   if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
466:   else flops = 0.0;
467:   MPIU_Allreduce(&flops,        &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
468:   MPIU_Allreduce(&flops,        &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
469:   MPIU_Allreduce(&flops,        &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
470:   avg  = (tot)/((PetscLogDouble) size);
471:   if (min != 0.0) ratio = max/min;
472:   else ratio = 0.0;
473:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", max/1.0E6, ratio, avg/1.0E6, tot/1.0E6);

475:   /*   Memory */
476:   PetscMallocGetMaximumUsage(&mem);
477:   if (mem > 0.0) {
478:     MPIU_Allreduce(&mem,          &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
479:     MPIU_Allreduce(&mem,          &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
480:     MPIU_Allreduce(&mem,          &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
481:     avg  = (tot)/((PetscLogDouble) size);
482:     if (min != 0.0) ratio = max/min;
483:     else ratio = 0.0;
484:     PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", max/1024.0/1024.0, ratio, avg/1024.0/1024.0, tot/1024.0/1024.0);
485:   }
486:   /*   Messages */
487:   mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
488:   MPIU_Allreduce(&mess,         &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
489:   MPIU_Allreduce(&mess,         &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
490:   MPIU_Allreduce(&mess,         &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
491:   avg  = (tot)/((PetscLogDouble) size);
492:   if (min != 0.0) ratio = max/min;
493:   else ratio = 0.0;
494:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", max, ratio, avg, tot);

496:   /*   Message Volume */
497:   mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
498:   MPIU_Allreduce(&mess,         &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
499:   MPIU_Allreduce(&mess,         &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
500:   MPIU_Allreduce(&mess,         &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
501:   avg = (tot)/((PetscLogDouble) size);
502:   if (min != 0.0) ratio = max/min;
503:   else ratio = 0.0;
504:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", max/1024.0/1024.0, ratio, avg/1024.0/1024.0, tot/1024.0/1024.0);

506:   /*   Reductions */
507:   MPIU_Allreduce(&red,          &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
508:   MPIU_Allreduce(&red,          &max, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
509:   MPIU_Allreduce(&red,          &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
510:   if (min != 0.0) ratio = max/min;
511:   else ratio = 0.0;
512:   PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", max, ratio, -1, -1);
513:   PetscViewerXMLEndSection(viewer, "globalperformance");
514:   return(0);
515: }

517: typedef struct {
518:   PetscLogEvent  dftEvent;
519:   NestedEventId  nstEvent;
520:   PetscLogEvent  dftParent;
521:   NestedEventId  nstParent;
522:   PetscBool      own;
523:   int            depth;
524:   NestedEventId* nstPath;
525: } PetscNestedEventTree;

527: /* Compare timers to sort them in the tree */
528: static int compareTreeItems(const void *item1_, const void *item2_)
529: {
530:   int                  i;
531:   PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
532:   PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;
533:   for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
534:     if (item1->nstPath[i]<item2->nstPath[i]) return -1;
535:     if (item1->nstPath[i]>item2->nstPath[i]) return +1;
536:   }
537:   if (item1->depth < item2->depth) return -1;
538:   if (item1->depth > item2->depth) return 1;
539:   return 0;
540: }
541: /*
542:  * Do MPI communication to get the complete, nested calling tree for all processes: there may be
543:  * calls that happen in some processes, but not in others.
544:  *
545:  * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
546:  * The tree is sorted so that the timers can be printed in the order of appearance.
547:  *
548:  * For tree-items which appear in the trees of multiple processes (which will be most items), the 
549:  * following rule is followed:
550:  * + if information from my own process is available, then that is the information stored in tree.
551:  *   otherwise it is some other process's information.
552:  */
553: static PetscErrorCode  PetscCreateLogTreeNested(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
554: {
555:   PetscNestedEventTree *tree = NULL, *newTree;
556:   int                  *treeIndices;
557:   int                  nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
558:   int                  yesno;
559:   PetscBool            done;
560:   PetscErrorCode       ierr;
561:   int                  maxdepth;
562:   int                  depth;
563:   int                  illegalEvent;
564:   int                  iextra;
565:   PetscStageLog        stageLog;
566:   NestedEventId        *nstPath, *nstMyPath;
567:   MPI_Comm             comm;

570:   PetscObjectGetComm((PetscObject)viewer,&comm);
571:   PetscLogGetStageLog(&stageLog);

573:   /* Calculate memory needed to store everybody's information and allocate tree */
574:   nTimers = 0;
575:   for (i=0; i<nNestedEvents; i++) nTimers+=nestedEvents[i].nParents;

577:   PetscMalloc1(nTimers,&tree);

579:   /* Fill tree with readily available information */
580:   iTimer0 = 0;
581:   maxDefaultTimer =0;
582:   for (i=0; i<nNestedEvents; i++) {
583:     int           nParents          = nestedEvents[i].nParents;
584:     NestedEventId nstEvent          = nestedEvents[i].nstEvent;
585:     PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
586:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
587:     for (j=0; j<nParents; j++) {
588:       maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);

590:       tree[iTimer0+j].dftEvent   = dftEvents[j];
591:       tree[iTimer0+j].nstEvent   = nstEvent;
592:       tree[iTimer0+j].dftParent  = dftParentsSorted[j];
593:       tree[iTimer0+j].own        = PETSC_TRUE;

595:       tree[iTimer0+j].nstParent  = 0;
596:       tree[iTimer0+j].depth      = 0;
597:       tree[iTimer0+j].nstPath    = NULL;
598:     }
599:     iTimer0 += nParents;
600:   }

602:   /* Calculate the global maximum for the default timer index, so array treeIndices can 
603:    * be allocated only once */
604:   MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
605:   maxDefaultTimer = j;

607:   /* Find default timer's place in the tree */
608:   PetscCalloc1(maxDefaultTimer+1,&treeIndices);
609:   treeIndices[0] = 0;
610:   for (i=0; i<nTimers; i++) {
611:     PetscLogEvent dftEvent = tree[i].dftEvent;
612:     treeIndices[dftEvent] = i;
613:   }

615:   /* Find each dftParent's nested identification */
616:   for (i=0; i<nTimers; i++) {
617:     PetscLogEvent dftParent = tree[i].dftParent;
618:     if (dftParent!= DFT_ID_AWAKE) {
619:       int j = treeIndices[dftParent];
620:       tree[i].nstParent  = tree[j].nstEvent;
621:     }
622:   }

624:   /* Find depths for each timer path */
625:   done = PETSC_FALSE;
626:   maxdepth = 0;
627:   while (!done) {
628:     done = PETSC_TRUE;
629:     for (i=0; i<nTimers; i++) {
630:       if (tree[i].dftParent == DFT_ID_AWAKE) {
631:         tree[i].depth = 1;
632:         maxdepth = PetscMax(1,maxdepth);
633:       } else {
634:         int j = treeIndices[tree[i].dftParent];
635:         depth = 1+tree[j].depth;
636:         if (depth>tree[i].depth) {
637:           done          = PETSC_FALSE;
638:           tree[i].depth = depth;
639:           maxdepth      = PetscMax(depth,maxdepth);
640:         }
641:       }
642:     }
643:   }

645:   /* Allocate the paths in the entire tree */
646:   for (i=0; i<nTimers; i++) {
647:     depth = tree[i].depth;
648:     PetscCalloc1(depth,&tree[i].nstPath);
649:   }

651:   /* Calculate the paths for all timers */
652:   for (depth=1; depth<=maxdepth; depth++) {
653:     for (i=0; i<nTimers; i++) {
654:       if (tree[i].depth==depth) {
655:         if (depth>1) {
656:           int    j = treeIndices[tree[i].dftParent];
657:           PetscMemcpy(tree[i].nstPath,tree[j].nstPath,(depth-1)*sizeof(NestedEventId));
658:         }
659:         tree[i].nstPath[depth-1] = tree[i].nstEvent;
660:       }
661:     }
662:   }
663:   PetscFree(treeIndices);

665:   /* Sort the tree on basis of the paths */
666:   qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);

668:   /* Allocate an array to store paths */
669:   depth = maxdepth;
670:   MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
671:   PetscMalloc1(maxdepth+1, &nstPath);
672:   PetscMalloc1(maxdepth+1, &nstMyPath);

674:   /* Find an illegal nested event index (1+largest nested event index) */
675:   illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
676:   i = illegalEvent;
677:   MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);

679:   /* First, detect timers which are not available in this process, but are available in others
680:    *        Allocate a new tree, that can contain all timers
681:    * Then,  fill the new tree with all (own and not-own) timers */
682:   newTree= NULL;
683:   for (yesno=0; yesno<=1; yesno++) {
684:     depth  = 1;
685:     i      = 0;
686:     iextra = 0;
687:     while (depth>0) {
688:       int       j;
689:       PetscBool same;

691:       /* Construct the next path in this process's tree:
692:        * if necessary, supplement with invalid path entries */
693:       depth++;
694:       if (depth > maxdepth + 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Depth %d > maxdepth+1 %d",depth,maxdepth+1);
695:       if (i<nTimers) {
696:         for (j=0;             j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
697:         for (j=tree[i].depth; j<depth;         j++) nstMyPath[j] = illegalEvent;
698:       } else {
699:         for (j=0;             j<depth;         j++) nstMyPath[j] = illegalEvent;
700:       }
701: 
702:       /* Communicate with other processes to obtain the next path and its depth */
703:       MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
704:       for (j=depth-1; (int) j>=0; j--) {
705:         if (nstPath[j]==illegalEvent) depth=j;
706:       }
707: 
708:       if (depth>0) {
709:         /* If the path exists */
710: 
711:         /* check whether the next path is the same as this process's next path */
712:         same = PETSC_TRUE;
713:         for (j=0; same && j<depth; j++) { same = (same &&  nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}
714: 
715:         if (same) {
716:           /* Register 'own path' */
717:           if (newTree) newTree[i+iextra] = tree[i];
718:           i++;
719:         } else {
720:           /* Register 'not an own path' */
721:           if (newTree) {
722:             newTree[i+iextra].nstEvent   = nstPath[depth-1];
723:             newTree[i+iextra].own        = PETSC_FALSE;
724:             newTree[i+iextra].depth      = depth;
725:             PetscMalloc1(depth, &newTree[i+iextra].nstPath);
726:             for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}

728:             newTree[i+iextra].dftEvent  = 0;
729:             newTree[i+iextra].dftParent = 0;
730:             newTree[i+iextra].nstParent = 0;
731:           }
732:           iextra++;
733:         }

735:       }
736:     }

738:     /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
739:     totalNTimers = nTimers + iextra;
740:     if (!newTree) {
741:       PetscMalloc1(totalNTimers, &newTree);
742:     }
743:   }
744:   PetscFree(nstPath);
745:   PetscFree(nstMyPath);
746:   PetscFree(tree);
747:   tree = newTree;
748:   newTree = NULL;

750:   /* Set return value and return */
751:   *p_tree    = tree;
752:   *p_nTimers = totalNTimers;
753:   return(0);
754: }

756: /*
757:  * Delete the nested timer tree 
758:  */
759: static PetscErrorCode  PetscLogFreeNestedTree(PetscNestedEventTree *tree, int nTimers)
760: {
761:   int             i;
762:   PetscErrorCode  ierr;
763: 
765:   for (i=0; i<nTimers; i++) {
766:     PetscFree(tree[i].nstPath);
767:   }
768:   PetscFree(tree);
769:   return(0);
770: }

772: /* Print the global performance: max, max/min, average and total of 
773:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
774:  */
775: static PetscErrorCode  PetscPrintXMLNestedLinePerfResults(PetscViewer viewer, const char *name, PetscLogDouble minvalue, PetscLogDouble maxvalue, PetscLogDouble minmaxtreshold)
776: {

780:   PetscViewerXMLStartSection(viewer, name, NULL);
781:   if (maxvalue>minvalue*minmaxtreshold) {
782:     PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%f");
783:     PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%f");
784:   } else {
785:     PetscViewerXMLPutDouble(viewer, "value", NULL, (minvalue+maxvalue)/2.0, "%g");
786:   };
787:   PetscViewerXMLEndSection(viewer, name);
788:   return(0);
789: }

791: #define N_COMM 8
792: static PetscErrorCode  PetscLogPrintNestedLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
793: {
794:   PetscLogDouble time = perfInfo.time;
795:   PetscLogDouble timeMx,          timeMn;
796:   PetscLogDouble countsPerCallMx, countsPerCallMn;
797:   PetscLogDouble reductSpeedMx,   reductSpeedMn;
798:   PetscLogDouble flopSpeedMx,     flopSpeedMn;
799:   PetscLogDouble msgSpeedMx,      msgSpeedMn;
800:   PetscLogDouble commarr_in[N_COMM], commarr_out[N_COMM];
802:   MPI_Comm       comm;

805:   PetscObjectGetComm((PetscObject)viewer,&comm);

807:   commarr_in[0] =  time;
808:   commarr_in[1] = -time;
809:   MPIU_Allreduce(commarr_in, commarr_out,    2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
810:   timeMx =  commarr_out[0];
811:   timeMn = -commarr_out[1];

813:   commarr_in[0] = time>=timeMx*0.001 ?  perfInfo.flops/time         : 0;
814:   commarr_in[1] = time>=timeMx*0.001 ?  perfInfo.numReductions/time : 0;
815:   commarr_in[2] = time>=timeMx*0.001 ?  perfInfo.messageLength/time : 0;
816:   commarr_in[3] = parentCount>0    ?  countsPerCall      : 0;

818:   commarr_in[4] = time>=timeMx*0.001 ? -perfInfo.flops/time         : -1e30;
819:   commarr_in[5] = time>=timeMx*0.001 ? -perfInfo.numReductions/time : -1e30;
820:   commarr_in[6] = time>=timeMx*0.001 ? -perfInfo.messageLength/time : -1e30;
821:   commarr_in[7] = parentCount>0    ? -countsPerCall      : -1e30;

823:   MPIU_Allreduce(commarr_in, commarr_out,  N_COMM, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

825:   flopSpeedMx     =  commarr_out[0];
826:   reductSpeedMx   =  commarr_out[1];
827:   msgSpeedMx      =  commarr_out[2];
828:   countsPerCallMx =  commarr_out[3];

830:   flopSpeedMn     = -commarr_out[4];
831:   reductSpeedMn   = -commarr_out[5];
832:   msgSpeedMn      = -commarr_out[6];
833:   countsPerCallMn = -commarr_out[7];

835:   *isPrinted = ((timeMx/totalTime) > (threshTime/100.0)) ? PETSC_TRUE : PETSC_FALSE;
836:   if (isPrinted) {
837:     PetscViewerXMLStartSection(viewer, "event", NULL);
838:     PetscViewerXMLPutString(viewer, "name", NULL, name);
839:     PetscPrintXMLNestedLinePerfResults(viewer, "time", timeMn/totalTime*100.0, timeMx/totalTime*100.0, 1.02);


842:     if (countsPerCallMx<1.01 && countsPerCallMn>0.99) {
843:       /* One call per parent */
844:     } else {
845:       PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", countsPerCallMn, countsPerCallMx, 1.02);
846:     }
847: 
848:     if (flopSpeedMx<0.01) {
849:       /* NO flops: don't print */
850:     } else {
851:       PetscPrintXMLNestedLinePerfResults(viewer, "mflops", flopSpeedMn/1e6, flopSpeedMx/1e6, 1.05);
852:     }
853: 
854:     if (msgSpeedMx<0.01) {
855:       /* NO msgs: don't print */
856:     } else {
857:       PetscPrintXMLNestedLinePerfResults(viewer, "mbps", msgSpeedMn/1024.0/1024.0, msgSpeedMx/1024.0/1024.0, 1.05);
858:     }
859: 
860:     if (reductSpeedMx<0.01) {
861:       /* NO reductions: don't print */
862:     } else {
863:       PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", reductSpeedMn, reductSpeedMx, 1.05);
864:     }
865:   }
866:   return(0);
867: }

869: /* Count the number of times the parent event was called */

871: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
872: {
873:    if (tree[i].depth<=1) {
874:      return 1;  /* Main event: only once */
875:    } else if (!tree[i].own) {
876:      return 1;  /* This event didn't happen in this process, but did in another */
877:    } else {
878:      int iParent;
879:      for (iParent=i-1; iParent>=0; iParent--) {
880:        if (tree[iParent].depth == tree[i].depth-1) break;
881:      }
882:      if (tree[iParent].depth != tree[i].depth-1) {
883:        printf("\n\n   *****  Internal error: cannot find parent ****\n\n");
884:        return -2;
885:      } else {
886:        PetscLogEvent dftEvent  = tree[iParent].dftEvent;
887:        return eventPerfInfo[dftEvent].count;
888:      }
889:    }
890: }

892: typedef struct {
893:   int             id;
894:   PetscLogDouble  val;
895: } PetscSortItem;

897: static int compareSortItems(const void *item1_, const void *item2_)
898: {
899:   PetscSortItem *item1 = (PetscSortItem *) item1_;
900:   PetscSortItem *item2 = (PetscSortItem *) item2_;
901:   if (item1->val > item2->val) return -1;
902:   if (item1->val < item2->val) return +1;
903:   return 0;
904: }

906: static PetscErrorCode  PetscLogNestedPrint(PetscViewer viewer, PetscNestedEventTree *tree,int nTimers, int iStart, PetscLogDouble totalTime)
907: {
908:   int                depth   = tree[iStart].depth;
909:   const char         *name;
910:   int                parentCount, nChildren;
911:   PetscSortItem      *children;
912:   PetscErrorCode     ierr;
913:   PetscEventPerfInfo *eventPerfInfo;
914:   PetscEventPerfInfo myPerfInfo,  otherPerfInfo, selfPerfInfo;
915:   PetscLogDouble     countsPerCall;
916:   PetscBool          wasPrinted;
917:   PetscBool          childWasPrinted;
918:   MPI_Comm           comm;

920:   {
921:   /* Look up the name of the event and its PerfInfo */
922:      const int          stage=0;
923:      PetscStageLog      stageLog;
924:      PetscEventRegInfo  *eventRegInfo;
925:      PetscLogGetStageLog(&stageLog);
926:      eventRegInfo  = stageLog->eventLog->eventInfo;
927:      eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
928:      name          = eventRegInfo[(PetscLogEvent) tree[iStart].nstEvent].name;
929:   }

931:   PetscObjectGetComm((PetscObject)viewer,&comm);

933:   /* Count the number of child processes */
934:   nChildren = 0;
935:   {
936:     int i;
937:     for (i=iStart+1; i<nTimers; i++) {
938:       if (tree[i].depth<=depth) break;
939:       if (tree[i].depth == depth + 1) nChildren++;
940:     }
941:   }

943:   if (nChildren>0) {
944:     /* Create an array for the id-s and maxTimes of the children,
945:      *  leaving 2 spaces for self-time and other-time */
946:     int            i;
947:     PetscLogDouble *times, *maxTimes;

949:     PetscMalloc1(nChildren+2,&children);
950:     nChildren = 0;
951:     for (i=iStart+1; i<nTimers; i++) {
952:       if (tree[i].depth<=depth) break;
953:       if (tree[i].depth == depth + 1) {
954:         children[nChildren].id  = i;
955:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
956:         nChildren++;
957:       }
958:     }

960:     /* Calculate the children's maximum times, to see whether children will be ignored or printed */
961:     PetscMalloc1(nChildren,&times);
962:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

964:     PetscMalloc1(nChildren,&maxTimes);
965:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
966:     PetscFree(times);

968:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
969:     PetscFree(maxTimes);
970:   }

972:   if (!tree[iStart].own) {
973:   /* Set values for a timer that was not activated in this process 
974:    * (but was, in other processes of this run) */
975:     PetscMemzero(&myPerfInfo,sizeof(myPerfInfo));

977:     selfPerfInfo  = myPerfInfo;
978:     otherPerfInfo = myPerfInfo;

980:     parentCount   = 1;
981:     countsPerCall = 0;
982:   } else {
983:   /* Set the values for a timer that was activated in this process */
984:     int           i;
985:     PetscLogEvent dftEvent   = tree[iStart].dftEvent;

987:     parentCount    = countParents( tree, eventPerfInfo, iStart);
988:     myPerfInfo     = eventPerfInfo[dftEvent];
989:     countsPerCall  = (PetscLogDouble) myPerfInfo.count / (PetscLogDouble) parentCount;

991:     selfPerfInfo                = myPerfInfo;
992:     otherPerfInfo.time          = 0;
993:     otherPerfInfo.flops         = 0;
994:     otherPerfInfo.numMessages   = 0;
995:     otherPerfInfo.messageLength = 0;
996:     otherPerfInfo.numReductions = 0;

998:     for (i=0; i<nChildren; i++) {
999:       /* For all child counters: subtract the child values from self-timers */

1001:       PetscLogEvent      dftChild  = tree[children[i].id].dftEvent;
1002:       PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];

1004:       selfPerfInfo.time          -= childPerfInfo.time;
1005:       selfPerfInfo.flops         -= childPerfInfo.flops;
1006:       selfPerfInfo.numMessages   -= childPerfInfo.numMessages;
1007:       selfPerfInfo.messageLength -= childPerfInfo.messageLength;
1008:       selfPerfInfo.numReductions -= childPerfInfo.numReductions;

1010:       if ((children[i].val/totalTime) < (threshTime/100.0)) {
1011:         /* Add them to 'other' if the time is ignored in the output */
1012:         otherPerfInfo.time          += childPerfInfo.time;
1013:         otherPerfInfo.flops         += childPerfInfo.flops;
1014:         otherPerfInfo.numMessages   += childPerfInfo.numMessages;
1015:         otherPerfInfo.messageLength += childPerfInfo.messageLength;
1016:         otherPerfInfo.numReductions += childPerfInfo.numReductions;
1017:       }
1018:     }
1019:   }

1021:   /* Main output for this timer */
1022:   PetscLogPrintNestedLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);

1024:   /* Now print the lines for the children */
1025:   if (nChildren>0) {
1026:     /* Calculate max-times for 'self' and 'other' */
1027:     int            i;
1028:     PetscLogDouble times[2], maxTimes[2];
1029:     times[0] = selfPerfInfo.time;   times[1] = otherPerfInfo.time;
1030:     MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1031:     children[nChildren+0].id = -1;
1032:     children[nChildren+0].val = maxTimes[0];
1033:     children[nChildren+1].id = -2;
1034:     children[nChildren+1].val = maxTimes[1];

1036:     /* Now sort the children (including 'self' and 'other') on total time */
1037:     qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);

1039:     /* Print (or ignore) the children in ascending order of total time */
1040:     PetscViewerXMLStartSection(viewer,"events", NULL);
1041:     for (i=0; i<nChildren+2; i++) {
1042:       if ((children[i].val/totalTime) < (threshTime/100.0)) {
1043:         /* ignored: no output */
1044:       } else if (children[i].id==-1) {
1045:         PetscLogPrintNestedLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1046:         if (childWasPrinted) {
1047:           PetscViewerXMLEndSection(viewer,"event");
1048:         }
1049:       } else if (children[i].id==-2) {
1050:         size_t  len;
1051:         char    *otherName;

1053:         PetscStrlen(name,&len);
1054:         PetscMalloc1(16+len,&otherName);
1055:         sprintf(otherName,"%s: other-timed",name);
1056:         PetscLogPrintNestedLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1057:         PetscFree(otherName);
1058:         if (childWasPrinted) {
1059:           PetscViewerXMLEndSection(viewer,"event");
1060:         }
1061:       } else {
1062:         /* Print the child with a recursive call to this function */
1063:         PetscLogNestedPrint(viewer, tree, nTimers, children[i].id, totalTime);
1064:       }
1065:     }
1066:     PetscViewerXMLEndSection(viewer,"events");
1067:     PetscFree(children);
1068:   }

1070:   if (wasPrinted) {
1071:     PetscViewerXMLEndSection(viewer, "event");
1072:   }
1073:   return 0;
1074: }

1076: static PetscErrorCode  PetscLogNestedPrintTop(PetscViewer viewer, PetscNestedEventTree *tree,int nTimers, PetscLogDouble totalTime)
1077: {
1078:   int                nChildren;
1079:   PetscSortItem      *children;
1080:   PetscErrorCode     ierr;
1081:   PetscEventPerfInfo *eventPerfInfo;
1082:   MPI_Comm           comm;

1085:   PetscObjectGetComm((PetscObject)viewer,&comm);
1086:   {
1087:   /* Look up the PerfInfo */
1088:      const int          stage=0;
1089:      PetscStageLog      stageLog;
1090:      PetscLogGetStageLog(&stageLog);
1091:      eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1092:   }

1094:   /* Count the number of child processes, and count total time */
1095:   nChildren = 0;
1096:   {
1097:     int i;
1098:     for (i=0; i<nTimers; i++) {
1099:       if (tree[i].depth==1) nChildren++;
1100:     }
1101:   }

1103:   if (nChildren>0) {
1104:     /* Create an array for the id-s and maxTimes of the children,
1105:      *  leaving 2 spaces for self-time and other-time */
1106:     int            i;
1107:     PetscLogDouble *times, *maxTimes;

1109:     PetscMalloc1(nChildren,&children);
1110:     nChildren = 0;
1111:     for (i=0; i<nTimers; i++) {
1112:       if (tree[i].depth == 1) {
1113:         children[nChildren].id  = i;
1114:         children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1115:         nChildren++;
1116:       }
1117:     }
1118: 
1119:     /* Calculate the children's maximum times, to sort them */
1120:     PetscMalloc1(nChildren,&times);
1121:     for (i=0; i<nChildren; i++) { times[i] = children[i].val; }

1123:     PetscMalloc1(nChildren,&maxTimes);
1124:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1125:     PetscFree(times);

1127:     for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1128:     PetscFree(maxTimes);

1130:     /* Now sort the children on total time */
1131:     qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1132:     /* Print (or ignore) the children in ascending order of total time */
1133:     PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1134:     PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1135:     PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, threshTime, "%f");

1137:     for (i=0; i<nChildren; i++) {
1138:       if ((children[i].val/totalTime) < (threshTime/100.0)) {
1139:         /* ignored: no output */
1140:       } else {
1141:         /* Print the child with a recursive call to this function */
1142:         PetscLogNestedPrint(viewer, tree, nTimers, children[i].id, totalTime);
1143:       }
1144:     }
1145:     PetscViewerXMLEndSection(viewer, "timertree");
1146:     PetscFree(children);
1147:   }
1148:   return(0);
1149: }

1151: typedef struct {
1152:   char           *name;
1153:   PetscLogDouble time;
1154:   PetscLogDouble flops;
1155:   PetscLogDouble numMessages;
1156:   PetscLogDouble messageLength;
1157:   PetscLogDouble numReductions;
1158: } PetscSelfTimer;

1160: static PetscErrorCode  PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1161: {
1162:   PetscErrorCode     ierr;
1163:   PetscEventPerfInfo *eventPerfInfo;
1164:   PetscEventRegInfo  *eventRegInfo;
1165:   PetscSelfTimer     *selftimes;
1166:   PetscSelfTimer     *totaltimes;
1167:   NestedEventId      *nstEvents;
1168:   int                i, maxDefaultTimer;
1169:   NestedEventId      nst;
1170:   PetscLogEvent      dft;
1171:   int                nstMax, nstMax_local;
1172:   MPI_Comm           comm;
1173: 
1175:   PetscObjectGetComm((PetscObject)viewer,&comm);
1176:   {
1177:     const int          stage=0;
1178:     PetscStageLog      stageLog;
1179:     PetscLogGetStageLog(&stageLog);
1180:     eventRegInfo  = stageLog->eventLog->eventInfo;
1181:     eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1182:   }

1184:   /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1185:   maxDefaultTimer =0;
1186:   for (i=0; i<nNestedEvents; i++) {
1187:     int           nParents         = nestedEvents[i].nParents;
1188:     PetscLogEvent *dftEvents       = nestedEvents[i].dftEvents;
1189:      int j;
1190:      for (j=0; j<nParents; j++) {
1191:        maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1192:      }
1193:   }
1194:   PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1195:   for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1196:   for (i=0; i<nNestedEvents; i++) {
1197:     int           nParents          = nestedEvents[i].nParents;
1198:     NestedEventId nstEvent          = nestedEvents[i].nstEvent;
1199:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1200:     int           j;
1201:     for (j=0; j<nParents; j++) { nstEvents[dftEvents[j]] = nstEvent; }
1202:   }

1204:   /* Calculate largest nested event-ID */
1205:   nstMax_local = 0;
1206:   for (i=0; i<nNestedEvents; i++) { if (nestedEvents[i].nstEvent>nstMax_local) {nstMax_local = nestedEvents[i].nstEvent;} }
1207:   MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);


1210:   /* Initialize all total-times with zero */
1211:   PetscMalloc1(nstMax+1,&selftimes);
1212:   PetscMalloc1(nstMax+1,&totaltimes);
1213:   for (nst=0; nst<=nstMax; nst++) {
1214:     totaltimes[nst].time          = 0;
1215:     totaltimes[nst].flops         = 0;
1216:     totaltimes[nst].numMessages   = 0;
1217:     totaltimes[nst].messageLength = 0;
1218:     totaltimes[nst].numReductions = 0;
1219:     totaltimes[nst].name          = NULL;
1220:   }

1222:   /* Calculate total-times */
1223:   for (i=0; i<nNestedEvents; i++) {
1224:     const int            nParents  = nestedEvents[i].nParents;
1225:     const NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1226:     const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1227:     int                  j;
1228:     for (j=0; j<nParents; j++) {
1229:       const PetscLogEvent dftEvent = dftEvents[j];
1230:       totaltimes[nstEvent].time          += eventPerfInfo[dftEvent].time;
1231:       totaltimes[nstEvent].flops         += eventPerfInfo[dftEvent].flops;
1232:       totaltimes[nstEvent].numMessages   += eventPerfInfo[dftEvent].numMessages;
1233:       totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1234:       totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1235:     }
1236:     totaltimes[nstEvent].name    = eventRegInfo[(PetscLogEvent) nstEvent].name;
1237:   }

1239:   /* Initialize: self-times := totaltimes */
1240:   for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }

1242:   /* Subtract timed subprocesses from self-times */
1243:   for (i=0; i<nNestedEvents; i++) {
1244:     const int           nParents          = nestedEvents[i].nParents;
1245:     const PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1246:     const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1247:     int                 j;
1248:     for (j=0; j<nParents; j++) {
1249:       if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1250:         const PetscLogEvent dftEvent  = dftEvents[j];
1251:         const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1252:         selftimes[nstParent].time          -= eventPerfInfo[dftEvent].time;
1253:         selftimes[nstParent].flops         -= eventPerfInfo[dftEvent].flops;
1254:         selftimes[nstParent].numMessages   -= eventPerfInfo[dftEvent].numMessages;
1255:         selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1256:         selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1257:       }
1258:     }
1259:   }

1261:   PetscFree(nstEvents);
1262:   PetscFree(totaltimes);

1264:   /* Set outputs */
1265:   *p_self  = selftimes;
1266:   *p_nstMax = nstMax;
1267:   return(0);
1268: }

1270: static PetscErrorCode  PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1271: {
1272:   PetscErrorCode     ierr;
1273:   int                i;
1274:   NestedEventId      nst;
1275:   PetscSortItem      *sortSelfTimes;
1276:   PetscLogDouble     *times, *maxTimes;
1277:   PetscEventRegInfo  *eventRegInfo;
1278:   const int          dum_depth = 1, dum_count=1, dum_parentcount=1;
1279:   PetscBool          wasPrinted;
1280:   MPI_Comm           comm;

1283:   PetscObjectGetComm((PetscObject)viewer,&comm);
1284:   {
1285:     PetscStageLog      stageLog;
1286:     PetscLogGetStageLog(&stageLog);
1287:     eventRegInfo  = stageLog->eventLog->eventInfo;
1288:   }

1290:   PetscMalloc1(nstMax+1,&times);
1291:   PetscMalloc1(nstMax+1,&maxTimes);
1292:   for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1293:   MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1294:   PetscFree(times);

1296:   PetscMalloc1(nstMax+1,&sortSelfTimes);

1298:   /* Sort the self-timers on basis of the largest time needed */
1299:   for (nst=0; nst<=nstMax; nst++) {
1300:     sortSelfTimes[nst].id  = nst;
1301:     sortSelfTimes[nst].val = maxTimes[nst];
1302:   }
1303:   PetscFree(maxTimes);
1304:   qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);

1306:   PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1307:   PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");

1309:   for (i=0; i<=nstMax; i++) {
1310:     if ((sortSelfTimes[i].val/totalTime) >= (threshTime/100.0)) {
1311:       NestedEventId      nstEvent = sortSelfTimes[i].id;
1312:       PetscEventPerfInfo selfPerfInfo;
1313:       const char         *name     = eventRegInfo[(PetscLogEvent) nstEvent].name;

1315:       selfPerfInfo.time          = selftimes[nstEvent].time ;
1316:       selfPerfInfo.flops         = selftimes[nstEvent].flops;
1317:       selfPerfInfo.numMessages   = selftimes[nstEvent].numMessages;
1318:       selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1319:       selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;
1320: 
1321:       PetscLogPrintNestedLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1322:       if (wasPrinted){
1323:         PetscViewerXMLEndSection(viewer, "event");
1324:       }
1325:     }
1326:   }
1327:   PetscViewerXMLEndSection(viewer, "selftimertable");
1328:   PetscFree(sortSelfTimes);
1329:   return(0);
1330: }

1332: PetscErrorCode  PetscLogView_Nested(PetscViewer viewer)
1333: {
1334:   MPI_Comm           comm;
1335:   PetscErrorCode     ierr;
1336:   PetscLogDouble     locTotalTime, globTotalTime;
1337:   PetscNestedEventTree *tree = NULL;
1338:   PetscSelfTimer     *selftimers = NULL;
1339:   int                nTimers = 0, nstMax = 0;
1340:   PetscViewerType    vType;

1343:   PetscViewerGetType(viewer,&vType);

1345:   /* Set useXMLFormat that controls the format in all local PetscPrint.. functions */
1346:   PetscViewerInitASCII_XML(viewer);

1348:   PetscObjectGetComm((PetscObject)viewer,&comm);

1350:   PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1351:   PetscViewerXMLStartSection(viewer, "petscroot", NULL);

1353:   /* Get the total elapsed time, local and global maximum */
1354:   PetscTime(&locTotalTime);  locTotalTime -= petsc_BaseTime;
1355:   MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

1357:   /* Print global information about this run */
1358:   PetscPrintExeSpecs(viewer);
1359:   PetscPrintGlobalPerformance(viewer, locTotalTime);
1360: 
1361:   /* Collect nested timer tree info from all processes */
1362:   PetscCreateLogTreeNested(viewer, &tree, &nTimers);
1363:   PetscLogNestedPrintTop(viewer, tree, nTimers, globTotalTime);
1364:   PetscLogFreeNestedTree(tree, nTimers);

1366:   /* Calculate self-time for all (not-nested) events */
1367:   PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1368:   PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1369:   PetscFree(selftimers);

1371:   PetscViewerXMLEndSection(viewer, "petscroot");
1372:   PetscViewerFinalASCII_XML(viewer);
1373:   PetscLogNestedEnd();
1374:   return(0);
1375: }

1377: #endif