#! /bin/csh -f
#
#   reduce a list of filenames to a set of patterns
#
#   Arguments (any order):
#     directory    every regular file found under it is added to the list
#     file         treated as a text file holding one filename per line
#     -nodata*     sets NO_DATASETS, which turns off numbered-run detection
#     integer      1..1000, number of output lines (slots) allowed, default 14
#
#   Output: at most about slots lines on stdout, each one a pattern
#   followed by an optional range or file count.
#   Exit status 9 is returned after printing the usage message.
#
#   Limitations: filenames containing white space are not supported, since
#   every stage below passes them around as awk fields.
#
#   NOTE(review): this file was received with all newlines collapsed and with
#   several spans inside awk programs missing.  Each reconstructed span is
#   marked with NOTE(review) below and must be checked against a pristine copy.
#
goto Setup

Help:
cat << EOF
usage: sumarize.com /directory
   or  sumarize.com filelist.txt
EOF
exit 9

Setup:
set slots    = 14
set tempfile = /tmp/sumarize_tmp$$
#set tempfile = tempfile

if($#argv == 0) goto Help

# start from a clean list so that several directory or list arguments
# accumulate instead of each one overwriting the previous one
rm -f ${tempfile}.filenames >& /dev/null

foreach arg ( $argv:q )
    if(-d "$arg") then
        find "$arg" -type f >> ${tempfile}.filenames
    endif
    if(-f "$arg") then
        # list of files
        cat "$arg" >> ${tempfile}.filenames
    endif
    if("$arg" =~ -nodata*) then
        set NO_DATASETS
    endif
    if("$arg" =~ [1-9]*) then
        # accept only whole numbers between 1 and 1000
        set temp = `echo "$arg" | awk '$1+0>0 && $1+0<=1000 && $1==int($1+0){print $1}'`
        if("$temp" != "") set slots = "$temp"
    endif
end
if(! -e ${tempfile}.filenames) goto Help


#################################################
# identify image file runs (preserving the order)
#
# A name ending in three digits, a dot and a three-letter lowercase extension
# is folded into a pattern with ### in place of the digits, and the lowest and
# highest number seen are tracked per pattern.
# The array holding a representative filename is called sample because
# default is a reserved word in gawk.
# NOTE(review): the two rules comparing n against low and high were partly
# missing from the received text and are reconstructed here.
cat ${tempfile}.filenames |\
awk -v nds=$?NO_DATASETS '{file=$0;pattern=file;sample[pattern]=file;n=""}\
 ! nds && /[0-9][0-9][0-9]\.[a-z][a-z][a-z]$/{n=split(file,a,"."); ext=a[n];\
    pattern=substr(file,1,length(file)-length(ext)-4) "###." ext;\
    n=substr(file,length(file)-length(ext)-3)+0;sample[pattern]=file;}\
 low[pattern]==""{low[pattern]=n; ++patterns; order[patterns]=pattern}\
 n<low[pattern]{low[pattern]=n}\
 n>=high[pattern]{high[pattern]=n}\
 END{\
   for(i=1;i<=patterns;++i) {pattern=order[i];\
     if(low[pattern]==high[pattern]) {print 1, sample[pattern]; continue};\
     printf "%d %s %s-%s\n", high[pattern]-low[pattern]+1, pattern, low[pattern], high[pattern]};}' |\
cat >! ${tempfile}.patterns
# format: #files pattern [range]

# see how many lines of text we will need
set lines = `cat ${tempfile}.patterns | wc -l`
if($lines <= $slots) then
    # everything fits: label each pattern with its original position
    cat ${tempfile}.patterns |\
    awk '{print $1,NR,$2,$3,$4}' |\
    cat >! ${tempfile}.labels
    # format: #files order pattern [range]
    goto printout
endif

if($lines > 10000) then
    # this is a LOT!  summarize by directory
    #
    # NOTE(review): the three awk programs in this branch were largely missing
    # from the received text.  They are rewritten from the surviving format
    # comments and fragments: count files per directory, measure how many
    # distinct directories exist at each depth, then truncate directories to
    # the deepest level that still fits in the available slots.  Confirm
    # against a pristine copy before relying on this branch.

    # count files per directory, in order of first appearance
    cat ${tempfile}.filenames |\
    awk -F "[/]" '{dir=""; for(i=1;i<NF;++i){dir = dir (i>1 ? "/" : "") $i}}\
       NF<2{dir="."}\
       dir==""{dir="/"}\
       ! (dir in count){++ndirs; order[ndirs]=dir}\
       {++count[dir]}\
       END{for(i=1;i<=ndirs;++i) print count[order[i]], order[i]}' |\
    cat >! ${tempfile}.dirs
    # format: count dirname

    # count how many slots arise from different depths
    cat ${tempfile}.filenames |\
    awk -F "[/]" '{path="";\
         for(depth=1;depth<NF;++depth){path=path "/" $depth;\
            if(! seen[path]){seen[path]=1; ++diversity[depth]}}}\
       NF>maxdepth{maxdepth=NF}\
       END{for(depth=1;depth<=maxdepth;++depth) print depth,diversity[depth]+0}' |\
    sort -nr >! ${tempfile}.diversity
    # format: depth number-of-distinct-directories

    # deepest level reached before the directory count exceeds the slots;
    # fall back to the top level when even depth 1 is too diverse
    set depth = `sort -n ${tempfile}.diversity | awk -v slots=$slots '$2>slots{exit} {last=$1} END{print last}'`
    if("$depth" == "") set depth = 1

    # merge directories that share their first depth components
    cat ${tempfile}.dirs |\
    awk -v depth=$depth '{n=$1; dir=substr($0,length($1)+2);\
          m=split(dir,part,"/"); if(m>depth) m=depth;\
          short=part[1]; for(i=2;i<=m;++i) short=short "/" part[i];\
          if(short=="") short="/";\
          if(! (short in count)){++ndirs; order[ndirs]=short};\
          count[short]+=n}\
       END{for(i=1;i<=ndirs;++i) print count[order[i]], order[i]}' |\
    cat >! ${tempfile}.patterns
    # format: #files dirname
endif


#################################################
# must further summarize patterns to trim this down

# keep a few patterns that represent lots of images:
# at most half of the slots go to patterns covering more than 10 files,
# largest first, and the original order is restored afterwards
cat ${tempfile}.patterns |\
awk '{n=$1;$1=""; print n, NR, $0}' |\
sort -nr |\
awk -v slots=$slots '{kp="pass"} $1 >10 && hits < slots/2{++hits;kp="keep"}\
   {print kp, $0}' |\
sort -n -k 3 |\
cat >! ${tempfile}.marked
#format: keep/pass #files order pattern [range]
rm -f ${tempfile}.patterns >& /dev/null

# partition the list into two
awk '/^keep /{print substr($0,6)}' ${tempfile}.marked |\
cat >! ${tempfile}.labels
#format: #files order pattern [range]

awk '/^pass /{print substr($0,6)}' ${tempfile}.marked |\
cat >! ${tempfile}.leftover
#format: #files order pattern [range]
rm -f ${tempfile}.marked >& /dev/null

set taken     = `cat ${tempfile}.labels | wc -l`
set freeslots = `echo $slots $taken | awk '{print $1-$2}'`

# high-impact patterns are saved in ${tempfile}.labels
# now, what to do with the remaining ones...

# find the maximum-length truncation that gives <= $freeslots patterns
#
# Start from the full names, so that leftovers which already fit are kept
# as they are, then shorten one character at a time until the number of
# distinct prefixes fits.  The len > 0 guard ends the search when no slots
# are free at all, which previously looped forever.
sort -k 3 ${tempfile}.leftover |\
awk -v freeslots=$freeslots '{file[NR]=$3} length($3)>len{len=length($3)} \
  END{templates=0;\
      for(n=1;n<=NR;++n){\
          if(! (file[n] in template)){template[file[n]]=1; ++templates}};\
      while(freeslots < templates && len > 0){--len;\
          delete template; templates=0;\
          for(n=1;n<=NR;++n){key=substr(file[n],1,len);\
              if(! (key in template)){template[key]=1; ++templates};\
              if(freeslots < templates) break}};\
      for(pattern in template) print pattern}' |\
sort -u >! ${tempfile}.templates

# template text must never be expanded as a shell wildcard
set noglob
foreach template ( `cat ${tempfile}.templates` )

    # extract list of current patterns that will match
    # (plain prefix comparison, so dots and brackets in names stay literal)
    cat ${tempfile}.leftover |\
    awk -v template="$template" 'substr($3,1,length(template)) == (template "")' |\
    cat >! ${tempfile}.hits

    # extend this template as much as possible with a single pattern:
    # for every character position, collect the distinct characters seen;
    # positions with a count of 1 belong to the common prefix, which can
    # never be longer than the shortest pattern
    cat ${tempfile}.hits |\
    awk 'NR==1 || length($3)<minlen{minlen=length($3)}\
         {t=0;for(i=1;i<=length($3);++i){c=substr($3,i,1);\
            if(substr(seen[i],length(seen[i])) != c || t==1){t=1;++count[i];seen[i]=seen[i] c}}} \
         END{for(i=1;i<=minlen && seen[i]!="";++i) print count[i], i, seen[i]}' |\
    cat >! ${tempfile}.branches
    set head = `awk '$1>1{exit} {print $2}' ${tempfile}.branches | tail -1`
    set head = `awk -v head="$head" '{print substr($3,1,head);exit}' ${tempfile}.hits`

    # how many files are we going to match?
    set order = `awk '{print $2;exit}' ${tempfile}.hits`
    set num   = `awk '{num+=$1} END{print num}' ${tempfile}.hits`
    set range = "$num"
    if ("$num" == 1) set range = ""

    # find a tail-end pattern too (same scan, counted from the last character)
    cat ${tempfile}.hits |\
    awk 'NR==1 || length($3)<minlen{minlen=length($3)}\
         {t=0;for(i=1;i<=length($3);++i){c=substr($3,length($3)-i+1,1);\
            if(substr(seen[i],length(seen[i])) != c || t==1){t=1;++count[i];seen[i]=seen[i] c}}} \
         END{for(i=1;i<=minlen && seen[i]!="";++i) print count[i], i, seen[i]}' |\
    cat >! ${tempfile}.tail
    set tail = `awk '$1>1{exit} {print $2}' ${tempfile}.tail | tail -1`
    set tail = `awk -v tail="$tail" '{print substr($3,length($3)-tail+1);exit}' ${tempfile}.hits`

    # a template that matched exactly one distinct pattern is printed as is,
    # with its own range; anything else becomes head*tail with a file count
    set distinct = `awk '! ($3 in s){s[$3]=1;++n} END{print n+0}' ${tempfile}.hits`
    if ("$distinct" == 1) then
        set tail  = ""
        set range = `awk '{print $4; exit}' ${tempfile}.hits`
    else
        set tail = "*$tail"
        if ("$range" != "") set range = "($range)"
    endif

    # save this
    echo "$num $order ${head}${tail} $range" >> ${tempfile}.labels
end
unset noglob

# at this point, we are guaranteed to have <= $slots labels (I think...)

printout:
# sort the list back into its original order and
# strip off anything that will not actually get printed out
cat ${tempfile}.labels |\
sort -n -k 2 |\
awk '{print $3,$4}' |\
cat

# clean up every temporary file this script may have created
foreach suffix ( labels branches dated dirs diversity filenames head hits leftover marked patterns runinfo slots tail templates )
    rm -f ${tempfile}.${suffix} >& /dev/null
end
# name kept exactly as the original cleanup list had it
rm -f ${tempfile}label >& /dev/null

exit