/**
@file dirThread.cc
@author Mitch Richling <http://www.mitchr.me/>
@Copyright Copyright 1998 by Mitch Richling. All rights reserved.
@brief Threaded traversal of UNIX directory trees@EOL
@Keywords UNIX readdir readdir_r threaded directory traversal
@Std ISOC POSIX UNIX98 BSD4.3 SYSV3
This C++ program is intended to illustrate how to
traverse a directory tree using a high performance
threaded approach. It simply prints out the names of
all files found.
The performance boost comes because this code can
multiplex several I/O calls. The resulting speed boost
is often 50x for network file systems.
In operation, this program is rather similar to
dirDepthC.cc in that it reads all the file names in a
directory and places sub-directory names into an STL
container to be traversed later. Unlike dirDepthC.cc
this method provides an unknown ordering for the
traversal -- the price we pay increased performance.
Another difference is that we use readdir_r instead of
readdir. While readir() uses a static structure to
return results, this isn't necessary a problem for this
program because the POSIX standard requires that
readir() use a DIFFERENT static structure per directory
handle. Thus readdir() COULD be POSIX compliant and
thread safe in this application because no two threads
ever read the same directory. Unfortunately the POSIX
standard expressly allows readdir() to be non-thread
safe. For example, it could use some other hidden
global state across calls and still be within the
specification. In practice all God fearing UNIX-like
operating systems don't do this, and thus have a
perfectly safe readdir() for this application; however,
if someone decides to use this code on a heathen
UNIX-like OS we use readdir_r and accept the extra code,
malloc, and performance penalty.
A note on SAFEREADDIR: readir() uses a static structure
to return results, but the POSIX standard requires that
it use a DIFFERENT static structure per directory
handle. Thus readdir() COULD be POSIX compliant and
thread safe in our application -- no two threads ever
read the same directory. Unfortunately the standard
expressly allows readdir to be non-thread safe. For
example, it could use some other hidden global state
across calls and still be in spec. In practice most God
fearing UNIX-like OSes don't do this, and thus have a
perfectly safe readdir() for this application; however,
if someone decides to use this code on a heathen
UNIX-like OS we provide the option to use readdir_r via
SAFEREADDIR and accept the extra code, malloc, and
performance (lock) penalty. Note that many systems have
a lock around the innards of readdir_r that allow the
process to only run one at a time, and that will
dramatically slow things down when SAFEREADDIR is 1.
The string handling is C-style, as that is code is
heavily laden with UNIX C API calls, and they all use
old school, null terminated, C-strings. While
everything is a C-string, we don't always use the string
functions strcpy() and strcat() because memcpy()
operates faster on most systems. The lengths of the
strings are known or must be computed anyhow, so
memcpy() is a good choice.
Note that this implementation doesn't suffer from
limitations on the number of open directory descriptors
like recursive approaches frequently do.
The functions used are all POSIX or ISO C++ and thus
this program will run on most UNIX-like OSes.
@Build
- MacOS X: c++ -I mtUtils dirThread.cc mtUtils/mtUtils.c
- Linux: c++ dirThread.cc -lpthread muUtils.c
- Solaris: CC -lpthread -lrt dirThread.cc muUtils.c
@Tested
- SPARC Solaris 7, 8
- x86_64 Solaris 10
- MacOS X.2, X.4, X.5, X.6
- RedHat 7.3
- RHEL 4.6, 4.8
- SLED 10.2
*/
#include <list> /* STL list C++ */
#include <sys/types.h> /* UNIX types POSIX */
#include <stdio.h> /* I/O lib ISOC */
#include <string.h> /* Strings ISOC */
#include <dirent.h> /* UNIX dirs POSIX */
#include <errno.h> /* error stf POSIX */
#include <utime.h> /* utime POSIX */
#include <sys/stat.h> /* UNIX stat POSIX */
#include <time.h> /* time ISOC */
#include <pthread.h> /* threads POSIX */
#include <sched.h> /* threads POSIX */
#include <stdlib.h> /* Standard Lib ISOC */
#include <unistd.h> /* UNIX std stf POSIX */
#include "mtUtils.h"
/** Maximum number-1 of threads to run at one time. */
#define NUMTHREADS 55
/** Number of directories to read before updating global linked list.
Set to 0 for adaptive maximum.*/
#define UPLOADTRIG 10
/** Use the absolutely thread-safe readdir_r function, or use the much
faster, mostly thread-safe readdir system call */
#define SAFEREADDIR 1
/* *************************************************************************** */
/** The list of dirs to traverse. */
std::list<char *> gblDirList;
/** Controls access to gblDirList */
pthread_mutex_t dirList_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t dirList_cond = PTHREAD_COND_INITIALIZER;
/* *************************************************************************** */
/* Prototypes. */
int main(int argc, char *argv[]);
/** Designed to be used by a thread. */
void readDir(char *workingDir);
/** This is a worker thread function that is intended to run readDir
as a function. It must have extern C linkage as that is part of
the POSIX threads standard. */
extern "C" { void workerThread(void *reqArg); }
/** Useful typedef for a POSIX thread function. Note we put it in
an extern block to avoid warnings about invalid types on some
compilers (Sun Compilers). */
extern "C" { typedef void* PTHRFUNC(void*); }
/** Number of threads with nothing to do. We protect this variable
with the same mutex we use for the global directory list. */
int numThreadsWaiting;
/* *************************************************************************** */
int main(int argc, char *argv[]) {
struct stat s;
char *wrkBuf;
pthread_t thread[NUMTHREADS];
int mustWait = 1, i;
/* Stat the root of the tree we are to traverse. */
if((argc == 2) && (lstat(argv[1], &s) < 0)) {
mtPrintf("ERROR: Bad first argument\n");
exit(1);
} /* end if */
/* Now that we have stat'ed' the thing, we add it as the list of directories */
wrkBuf = (char *)malloc(strlen(argv[1]) + 1);
strcpy(wrkBuf, argv[1]);
gblDirList.push_back(wrkBuf);
/* Lock the dirList_mutex to keep new threads from going right away. */
pthread_mutex_lock(&dirList_mutex);
/* Here we fire off n-threads */
numThreadsWaiting = 0;
for(i = 0; i < NUMTHREADS; i++) {
if(pthread_create(&(thread[i]), NULL, (PTHRFUNC*)workerThread, (void *)NULL) != 0) {
mtPrintf("ERROR(9): dirThread: pthread_create: Couldn't create thread.\n");
exit(1);
} /* end if */
pthread_detach(thread[i]);
} /* end for */
/* Unlock dirList_mutex to unleash all the threads..*/
pthread_mutex_unlock(&dirList_mutex);
/* Loop until we have nothing left to do */
mustWait = 1;
while(mustWait) {
pthread_mutex_lock(&dirList_mutex);
if((gblDirList.empty()) && (numThreadsWaiting == NUMTHREADS)) {
mustWait = 0;
} /* end if */
pthread_mutex_unlock(&dirList_mutex);
/* We broadcast here to avoid deadlock. */
pthread_cond_broadcast(&dirList_cond);
/* We sleep for a bit to reduce broadcast frequency. */
struct timespec nanotime;
nanotime.tv_sec = 0;
nanotime.tv_nsec = 1000;
nanosleep(&nanotime, NULL);
} /* end while */
pthread_mutex_unlock(&dirList_mutex);
/* On some platforms we get stuck unless we kill off the threads */
for(i = 0; i < NUMTHREADS; i++) {
pthread_cancel(thread[i]);
} /* end for */
/* Everything is done now */
return 0;
} /* end func main */
/* *************************************************************************** */
void workerThread(void *reqArg) {
char *workingDir;
/* Just set here until the master thread unlocks dirList_mutex! */
pthread_mutex_lock(&dirList_mutex);
pthread_mutex_unlock(&dirList_mutex);
while(1) {
pthread_mutex_lock(&dirList_mutex);
if(gblDirList.empty()) {
numThreadsWaiting++;
while(gblDirList.empty()) {
pthread_cond_wait(&dirList_cond, &dirList_mutex);
} /* end while */
/* If we get here, we have dirList_mutex locked. */
numThreadsWaiting--;
pthread_mutex_unlock(&dirList_mutex);
} else {
workingDir = gblDirList.back();
gblDirList.pop_back();
pthread_mutex_unlock(&dirList_mutex);
readDir(workingDir);
} /* end if/else */
} /* end while */
} /* end func workerThread */
/* *************************************************************************** */
void readDir(char *workingDir) {
DIR *dp;
struct dirent *dep;
#if SAFEREADDIR
struct dirent *depp;
int readdirRes;
#endif
struct stat s;
char *name; /* Working space for string manipulation */
long len_d_name, len_workingDirName; /* String lengths */
int haveMore;
int localDirListLen=0;
std::list<char *> localDirList; /* Local working list of directories */
if( (dp = opendir(workingDir)) == NULL) {
mtPrintf("ERROR(4): dirThread: opendir: %s: Undetermined\n", workingDir);
} else {
int name_max=pathconf(workingDir, _PC_NAME_MAX);
if(name_max <= 0) {
mtPrintf("ERROR(7): dirThread: pathconf: %s: Undetermined\n", workingDir);
exit(1);
} /* end if */
#if SAFEREADDIR
/* Some platforms have large dirent sizes... Broken if d_name not last thing... */
size_t depSize = offsetof(struct dirent, d_name) + name_max + 2;
if (depSize < sizeof(struct dirent)) // Make sure we are at east as bit as a dirent...
depSize = sizeof(struct dirent);
if((dep = (struct dirent *)malloc(depSize)) == NULL) {
mtPrintf("ERROR(8): dirThread: malloc: direntBlob: Undetermined\n");
exit(1);
} /* end if */
#endif
len_workingDirName = strlen(workingDir);
haveMore = 1;
while(haveMore) {
#if SAFEREADDIR
readdirRes = readdir_r(dp, dep, &depp);
#else
dep = readdir(dp);
#endif
if(
#if SAFEREADDIR
/* Note that we should do something different for errors for readdir_r... */
((depp == NULL) || (readdirRes != 0))
#else
/* Note that we should check errno for readdir as it might not be EOF but an error... */
(dep == NULL)
#endif
) {
haveMore = 0;
} else {
if((strcmp(dep->d_name, "..") != 0) && (strcmp(dep->d_name, ".") != 0)) {
len_d_name = strlen(dep->d_name);
if((name = (char *)malloc(len_d_name + len_workingDirName + 2)) == NULL) {
mtPrintf("ERROR(5): dirThread: malloc: Undetermined\n");
exit(1);
} /* end if */
strcpy(name, workingDir);
strcat(name, "/");
strcat(name, dep->d_name);
if(lstat(name, &s)) {
mtPrintf("ERROR(6): dirThread: lstat: %s: Undetermined\n", name);
free(name);
} else {
if(S_ISDIR(s.st_mode)) {
localDirList.push_back(name);
localDirListLen++;
mtPrintf(" D: %s\n", name);
/* If we found UPLOADTRIG directories, then put them in
the global list. We don't use localDirList.size() as
it might be O(1) or O(n) per the C++ standard. */
if(UPLOADTRIG && (localDirListLen > UPLOADTRIG)) {
pthread_mutex_lock(&dirList_mutex);
gblDirList.splice(gblDirList.end(), localDirList);
pthread_cond_broadcast(&dirList_cond);
pthread_mutex_unlock(&dirList_mutex);
} /* end if */
} else {
mtPrintf("ND: %s\n", name);
free(name);
} /* end if/else */
} /* end if/else */
} /* end if */
} /* end if */
} /* end while */
closedir(dp);
free(dep);
} /* end if/else */
pthread_mutex_lock(&dirList_mutex);
gblDirList.splice(gblDirList.end(), localDirList);
pthread_cond_broadcast(&dirList_cond);
pthread_mutex_unlock(&dirList_mutex);
/* Free up the string for this dirent. */
free(workingDir);
} /* end func readDir */
Generated by GNU Enscript 1.6.5.2.