dirThread.cc

/**
   @file      dirThread.cc
   @author    Mitch Richling <http://www.mitchr.me/>
   @Copyright Copyright 1998 by Mitch Richling.  All rights reserved.
   @brief     Threaded traversal of UNIX directory trees@EOL
   @Keywords  UNIX readdir readdir_r threaded directory traversal
   @Std       ISOC POSIX UNIX98 BSD4.3 SYSV3

              This C++ program is intended to illustrate how to
              traverse a directory tree using a high performance
              threaded approach.  It simply prints out the names of
              all files found.  

              The performance boost comes because this code can
              multiplex several I/O calls.  The resulting speed boost
              is often 50x for network file systems.
              
              In operation, this program is rather similar to
              dirDepthC.cc in that it reads all the file names in a
              directory and places sub-directory names into an STL
              container to be traversed later.  Unlike dirDepthC.cc
              this method provides an unknown ordering for the
              traversal -- the price we pay increased performance.

              Another difference is that we use readdir_r instead of
              readdir.  While readir() uses a static structure to
              return results, this isn't necessary a problem for this
              program because the POSIX standard requires that
              readir() use a DIFFERENT static structure per directory
              handle.  Thus readdir() COULD be POSIX compliant and
              thread safe in this application because no two threads
              ever read the same directory.  Unfortunately the POSIX
              standard expressly allows readdir() to be non-thread
              safe.  For example, it could use some other hidden
              global state across calls and still be within the
              specification.  In practice all God fearing UNIX-like
              operating systems don't do this, and thus have a
              perfectly safe readdir() for this application; however,
              if someone decides to use this code on a heathen
              UNIX-like OS we use readdir_r and accept the extra code,
              malloc, and performance penalty.

              A note on SAFEREADDIR: readir() uses a static structure
              to return results, but the POSIX standard requires that
              it use a DIFFERENT static structure per directory
              handle.  Thus readdir() COULD be POSIX compliant and
              thread safe in our application -- no two threads ever
              read the same directory.  Unfortunately the standard
              expressly allows readdir to be non-thread safe.  For
              example, it could use some other hidden global state
              across calls and still be in spec.  In practice most God
              fearing UNIX-like OSes don't do this, and thus have a
              perfectly safe readdir() for this application; however,
              if someone decides to use this code on a heathen
              UNIX-like OS we provide the option to use readdir_r via
              SAFEREADDIR and accept the extra code, malloc, and
              performance (lock) penalty.  Note that many systems have
              a lock around the innards of readdir_r that allow the
              process to only run one at a time, and that will
              dramatically slow things down when SAFEREADDIR is 1.
              
              The string handling is C-style, as that is code is
              heavily laden with UNIX C API calls, and they all use
              old school, null terminated, C-strings.  While
              everything is a C-string, we don't always use the string
              functions strcpy() and strcat() because memcpy()
              operates faster on most systems.  The lengths of the
              strings are known or must be computed anyhow, so
              memcpy() is a good choice.
              
              Note that this implementation doesn't suffer from
              limitations on the number of open directory descriptors
              like recursive approaches frequently do.

              The functions used are all POSIX or ISO C++ and thus
              this program will run on most UNIX-like OSes.

   @Build     
              - MacOS X: c++ -I mtUtils dirThread.cc mtUtils/mtUtils.c
              - Linux: c++ dirThread.cc -lpthread muUtils.c
              - Solaris: CC -lpthread -lrt dirThread.cc muUtils.c

   @Tested    
              - SPARC Solaris 7, 8
              - x86_64 Solaris 10
              - MacOS X.2, X.4, X.5, X.6
              - RedHat 7.3
              - RHEL 4.6, 4.8
              - SLED 10.2
*/

#include <list>                 /* STL list        C++   */
#include <sys/types.h>          /* UNIX types      POSIX */
#include <stdio.h>              /* I/O lib         ISOC  */
#include <string.h>             /* Strings         ISOC  */
#include <dirent.h>             /* UNIX dirs       POSIX */
#include <errno.h>              /* error stf       POSIX */
#include <utime.h>              /* utime           POSIX */
#include <sys/stat.h>           /* UNIX stat       POSIX */
#include <time.h>               /* time            ISOC  */
#include <pthread.h>            /* threads         POSIX */
#include <sched.h>              /* threads         POSIX */
#include <stdlib.h>             /* Standard Lib    ISOC  */
#include <unistd.h>             /* UNIX std stf    POSIX */

#include "mtUtils.h"

/** Maximum number-1 of threads to run at one time. */
#define NUMTHREADS 55

/** Number of directories to read before updating global linked list.
    Set to 0 for adaptive maximum.*/
#define UPLOADTRIG 10

/** Use the absolutely thread-safe readdir_r function, or use the much
    faster, mostly thread-safe readdir system call */
#define SAFEREADDIR 1

/* *************************************************************************** */
/** The list of dirs to traverse. */
std::list<char *> gblDirList;

/** Controls access to gblDirList */
pthread_mutex_t dirList_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t dirList_cond = PTHREAD_COND_INITIALIZER;

/* *************************************************************************** */
/* Prototypes. */

int main(int argc, char *argv[]);

/** Designed to be used by a thread. */
void readDir(char *workingDir);

/** This is a worker thread function that is intended to run readDir
    as a function. It must have extern C linkage as that is part of
    the POSIX threads standard. */ 
extern "C" { void workerThread(void *reqArg); }

/**  Useful typedef for a POSIX thread function.  Note we put it in
     an extern block to avoid warnings about invalid types on some
     compilers (Sun Compilers). */
extern "C" { typedef void*  PTHRFUNC(void*);  }

/** Number of threads with nothing to do. We protect this variable
    with the same mutex we use for the global directory list. */
int numThreadsWaiting;

/* *************************************************************************** */
int main(int argc, char *argv[]) {
  struct stat s;
  char *wrkBuf;
  pthread_t thread[NUMTHREADS];
  int mustWait = 1, i;

  /* Stat the root of the tree we are to traverse. */
  if((argc == 2) && (lstat(argv[1], &s) < 0)) {
    mtPrintf("ERROR: Bad first argument\n");
    exit(1);
  } /* end if */
  /* Now that we have stat'ed' the thing, we add it as the list of directories */
  wrkBuf = (char *)malloc(strlen(argv[1]) + 1);
  strcpy(wrkBuf, argv[1]);
  gblDirList.push_back(wrkBuf);
 
  /* Lock the dirList_mutex to keep new threads from going right away. */
  pthread_mutex_lock(&dirList_mutex);

  /* Here we fire off n-threads */
  numThreadsWaiting = 0;
  for(i = 0; i < NUMTHREADS; i++) {
    if(pthread_create(&(thread[i]), NULL, (PTHRFUNC*)workerThread, (void *)NULL) != 0) {
      mtPrintf("ERROR(9): dirThread: pthread_create: Couldn't create thread.\n");
      exit(1);
    } /* end if */
    pthread_detach(thread[i]);
  } /* end for */

  /* Unlock dirList_mutex to unleash all the threads..*/
  pthread_mutex_unlock(&dirList_mutex);

  /* Loop until we have nothing left to do */
  mustWait = 1;
  while(mustWait) {
    pthread_mutex_lock(&dirList_mutex);
    if((gblDirList.empty()) && (numThreadsWaiting == NUMTHREADS)) {
      mustWait = 0;
    } /* end if */
    pthread_mutex_unlock(&dirList_mutex);

    /* We broadcast here to avoid deadlock. */
    pthread_cond_broadcast(&dirList_cond);
    /* We sleep for a bit to reduce broadcast frequency. */
    struct timespec nanotime;
    nanotime.tv_sec = 0;
    nanotime.tv_nsec = 1000;
    nanosleep(&nanotime, NULL);

  } /* end while */
  pthread_mutex_unlock(&dirList_mutex);

  /* On some platforms we get stuck unless we kill off the threads */
  for(i = 0; i < NUMTHREADS; i++) {
    pthread_cancel(thread[i]);
  } /* end for */

  /* Everything is done now */
  return 0;
} /* end func main */

/* *************************************************************************** */
void workerThread(void *reqArg) {
  char *workingDir;

  /* Just set here until the master thread unlocks dirList_mutex! */
  pthread_mutex_lock(&dirList_mutex);
  pthread_mutex_unlock(&dirList_mutex);

  while(1) {
    pthread_mutex_lock(&dirList_mutex);
    if(gblDirList.empty()) {
      numThreadsWaiting++;
      while(gblDirList.empty()) {
        pthread_cond_wait(&dirList_cond, &dirList_mutex);
      } /* end while */
      /* If we get here, we have dirList_mutex locked. */
      numThreadsWaiting--;
      pthread_mutex_unlock(&dirList_mutex);
    } else {
      workingDir = gblDirList.back();
      gblDirList.pop_back();
      pthread_mutex_unlock(&dirList_mutex);
      readDir(workingDir);
    } /* end if/else */
  } /* end while */
} /* end func workerThread */

/* *************************************************************************** */
void readDir(char *workingDir) {
  DIR *dp;
  struct dirent *dep;
#if SAFEREADDIR
  struct dirent *depp;
  int readdirRes;
#endif
  struct stat s;
  char *name;                             /* Working space for string manipulation */
  long len_d_name, len_workingDirName;    /* String lengths */
  int haveMore;
  int localDirListLen=0;
  std::list<char *> localDirList;        /* Local working list of directories */

  if( (dp = opendir(workingDir)) == NULL) {
    mtPrintf("ERROR(4): dirThread: opendir: %s: Undetermined\n", workingDir);
  } else {
    int name_max=pathconf(workingDir, _PC_NAME_MAX);
    if(name_max <= 0) {
      mtPrintf("ERROR(7): dirThread: pathconf: %s: Undetermined\n", workingDir);
      exit(1);
    } /* end if */
#if SAFEREADDIR
    /* Some platforms have large dirent sizes...  Broken if d_name not last thing... */
    size_t depSize = offsetof(struct dirent, d_name) + name_max + 2;
    if (depSize < sizeof(struct dirent)) // Make sure we are at east as bit as a dirent...
      depSize = sizeof(struct dirent);
    if((dep = (struct dirent *)malloc(depSize)) == NULL) {
      mtPrintf("ERROR(8): dirThread: malloc: direntBlob: Undetermined\n");
      exit(1);
    } /* end if */
#endif
    len_workingDirName = strlen(workingDir);
    haveMore = 1;
    while(haveMore) {
#if SAFEREADDIR
      readdirRes = readdir_r(dp, dep, &depp);
#else
      dep = readdir(dp);
#endif
      if(
#if SAFEREADDIR
        /* Note that we should do something different for errors for readdir_r... */
        ((depp == NULL) || (readdirRes != 0))
#else
        /* Note that we should check errno for readdir as it might not be EOF but an error... */
        (dep == NULL)
#endif
        ) {
        haveMore = 0;
      } else {
        if((strcmp(dep->d_name, "..") != 0) && (strcmp(dep->d_name, ".") != 0)) {
          len_d_name = strlen(dep->d_name);
          if((name = (char *)malloc(len_d_name + len_workingDirName + 2)) == NULL) {
            mtPrintf("ERROR(5): dirThread: malloc: Undetermined\n");
            exit(1);
          } /* end if */
          strcpy(name, workingDir);
          strcat(name, "/");
          strcat(name, dep->d_name);
          if(lstat(name, &s)) {
            mtPrintf("ERROR(6): dirThread: lstat: %s: Undetermined\n", name);
            free(name);
          } else {
            if(S_ISDIR(s.st_mode)) {
              localDirList.push_back(name);
              localDirListLen++;
              mtPrintf(" D: %s\n", name);
              /* If we found UPLOADTRIG directories, then put them in
                 the global list. We don't use localDirList.size() as
                 it might be O(1) or O(n) per the C++ standard. */
              if(UPLOADTRIG && (localDirListLen > UPLOADTRIG)) {
                pthread_mutex_lock(&dirList_mutex);
                gblDirList.splice(gblDirList.end(), localDirList);
                pthread_cond_broadcast(&dirList_cond);
                pthread_mutex_unlock(&dirList_mutex);
              } /* end if */
            } else {
               mtPrintf("ND: %s\n", name);
              free(name);
            } /* end if/else */
          } /* end if/else */
        } /* end if */
      } /* end if */
    } /* end while */
    closedir(dp);
    free(dep);
  } /* end if/else */
  pthread_mutex_lock(&dirList_mutex);
  gblDirList.splice(gblDirList.end(), localDirList);

  pthread_cond_broadcast(&dirList_cond);
  pthread_mutex_unlock(&dirList_mutex);
  /* Free up the string for this dirent. */
  free(workingDir);
} /* end func readDir */

Generated by GNU Enscript 1.6.5.2.