#! /bin/csh -f
#
#	SGsearch.com	4.3			-James Holton   6-1-11
#
#  Space group search script for scala scripts
#
set rawfile  = raw.mtz
set script   = merge.com
set logfile  = logs/merge
set tempfile = ${CCP4_SCR}/SGsearch_temp$$

set SG   = P1		# read from $rawfile
set CELL		
set RES  = ""		# default to resolution in $script or $rawfile
set P1_scales_SG = ""


alias nawk /usr/bin/awk
nawk 'BEGIN{exit}' >& /dev/null
if($status) alias nawk awk

# command-line arguments (1 of 2)
foreach arg ( $* )
    if(-e "$arg") then
	if("$arg" =~ *.mtz) set rawfile = "$arg"
	if("$arg" =~ *.com) set script  = "$arg"
    endif
    if("$arg" == ignore) set IGNORE
end

if((! -e "$rawfile")||(! -e "$script")) then
cat << EOF
usage: $0 [$script] [$rawfile] $SG 2.2A
where:
 $script will scale and merge
 scaling, mergeing and truncating $rawfile

 $rawfile is the unmerged mtz file

 $SG is the "root" space group i.e.
    P222 for P222, P2221, P21212, and P212121

 2.2A is the resolution for scaling and mergeing


EOF
    exit 1
endif

ls -1 ${logfile}.*.log >& /dev/null
if((! $status)&&(! $?IGNORE)) then
    echo ""
    echo "NOTICE: using stats from ${logfile}.*.log"
    echo ""
endif
if(! -e logs) mkdir logs

echo -n "evaluating alternative space groups for $rawfile with $script"

##############################################################################
# update cell and SG variables from the raw MTZ header
#
##############################################################################
echo "HEADER" | mtzdump hklin $rawfile >! $tempfile
set SG = `nawk '/Space group/{print $5}' $tempfile`
set SG = `nawk '/Space group/{gsub("\047","");print $5}' $tempfile`
set SGnum = ` nawk '/Space group/{print $NF+0}' ${tempfile} `
set SG = ` nawk -F "[\047]" '/Space group/{print $2}' ${tempfile} `
set SG = ` nawk -v num=$SGnum '$1==num && NF>5{print $4}' ${CLIBD}/symop.lib `
set CELL = `nawk '/Cell Dimensions/{getline;getline;print}' $tempfile`
set mtzRES = `nawk '/Resolution Range/{getline;getline;print $6}' $tempfile`

# calculate unit cell volume
echo $CELL |\
nawk 'NF==6{s=atan2(1,0)/90; A=cos(s*$4); B=cos(s*$5); G=cos(s*$6); \
 skew = 1 + 2*A*B*G - A*A - B*B - G*G ; if(skew < 0) skew = -skew;\
 printf "%.3f\n", $1*$2*$3*sqrt(skew)}' |\
cat >! ${tempfile}volume
set CELLvolume = `cat ${tempfile}volume`
rm -f ${tempfile}volume >> /dev/null

# command-line arguments (2 of 2)
foreach arg ( $* )
    if("$arg" =~ [PpCcIiFfRrHh][1-6]*) set SG = `echo $arg | nawk '{print toupper($0)}'`
    if(("$arg" =~ [0-9]*)&&("$arg" !~ [a-z,A-Z]*)) then
	# a pure number is on the command line
	if("$arg" =~ *.*) then
	    # take it as resolution
	    set RES = `echo $arg | nawk '$1+0>0.5 && $1+0<10{print $1+0}'`
	else
	    # take it as ASU size
	    set ASU = `echo $arg | nawk '{printf "%d", $1+0}'`
	endif
    endif
    if("$arg" =~ [0-9]*A) then
	# looks like a resolution
	set temp = `echo $arg | nawk '$1+0>0.5 && $1+0<10{print $1+0}'`
	if($#temp) set RES = "$temp"
    endif
end


if("$RES" != "") then
    echo " to ${RES}A"
else
    echo ""
    set RES = "$mtzRES"
endif

# extract protein-compatible space groups from CCP4 in order of increasing symmetry
#nawk '$5 ~ /^PG/ && ! /m/ && ! /bar/ && $1 < 500' $CLIBD/symop.lib >! $tempfile
cat ${CLIBD}/symop.lib |\
nawk '$5 ~ /^PG/ && ! /m/ && ! /bar/ && $1 < 500{\
	printf "%s ", $0; for(op=$2;op>0;--op){getline;printf $1 $2};print ""}' |\
nawk '{print length($NF), $0}' |\
sort -n |\
nawk '{print $2,$3,$4,$5,$6,$7}' >! $tempfile


# get crystal system of $SG
set sys = `nawk -v SG=$SG '$4==SG{print $6}' $tempfile`
set latt = `echo $SG | nawk '{print substr($1,1,1)}'`

# extract point group representatives
nawk 'substr($4,2)==substr($5,3)' $tempfile >! ${tempfile}PGs
# extract point group representatives of the same lattice type
#nawk -v latt=$latt 'substr($4,1,1)==latt && substr($4,2)==substr($5,3)' $tempfile >! ${tempfile}PGs
nawk -v latt=$latt 'substr($4,1,1)==latt && ! seen[substr($5,3)]{print;++seen[substr($5,3)]}' $tempfile >! ${tempfile}PGs


# now get all point groups that co-index with $SG
if(("$sys" == "TRIGONAL") || ("$sys" == "HEXAGONAL")) then
    set SGs =  `nawk '$6 ~ /TRIG/{print $4} $6 ~ /HEXA/{print $4}' ${tempfile}PGs`
else
    set SGs =  `nawk -v S=$sys '$6==S{print $4}' ${tempfile}PGs`
endif
rm -f $tempfile ${tempfile}PGs

echo "SG      ASU(A^3)  ASU(~kD) Rmerge(%)   Brightest systematic absence"

SGcheck:

# add any extra considerations
set SGs = ( $SGs )
if("$SGs " !~ "P1 "*) set SGs = "P1 $SGs"

foreach SG ( $SGs )

    # get size of ASU (in A^3)
    set tempV = `nawk -v SG=$SG '$4==SG && $1 < 500{print $2}' $CLIBD/symop.lib`
    set tempV = `echo "$CELLvolume $tempV" | nawk '$2+0!=0{print $1/$2}'`
    if("$tempV" != "") set volume = "$tempV"

    # indicate P1 merge after P1_scales_SG scaling
    set printSG = "$SG"
    if("$P1_scales_SG" != "" && "$SG" == "P1" && "$P1_scales_SG" != "P1") then
	set SG = $P1_scales_SG
	set printSG = "${SG}-P1"
    endif
    
    # print it out in A^3 and kD at Vm=2.5
    echo "$printSG" | nawk '{printf "%-7s", $0}'
    echo "$volume" | nawk '{printf "%9d  %3d      ", $0, $0/2500}'   
    
    # remove bum log files
    if(-e ${logfile}.${printSG}.log) then
	grep Overall: ${logfile}.${printSG}.log >& /dev/null
	if(($status)||($?IGNORE)) rm -f ${logfile}.${printSG}.log
    endif
    
    # set up reindexing procedure
    set REINDEX = ""
    set new_SG  = "$SG"
    # check for "pseudo-spacegroups"
    set temp = `echo $SG | nawk '{print toupper($1)}'`
    if("$temp" =~ [PC]2[12][12]*) then
	set axes = ""
	if("$temp" == P2221) then
	    # P2221 with screw along longest axis
	    set axes  = "a b c"
	    set new_SG = "P2221"
	endif
	if("$temp" == P2212) then
	    # P2221 with screw along mid-length axis
	    set axes  = "b c a"
	    set new_SG = "P2221"
	endif
	if("$temp" == P2122) then
	    # P2221 with screw along shortest axis
	    set axes  = "c a b"
	    set new_SG = "P2221"
	endif
	
	if("$temp" == P21212) then
	    # P21212 with non-screw along longest axis
	    set axes  = "a b c"
	    set new_SG = "P21212"
	endif
	if("$temp" == P21221) then
	    # P21212 with non-screw along mid-length axis
	    set axes  = "b c a"
	    set new_SG = "P21212"
	endif
	if("$temp" == P22121) then
	    # P21212 with non-screw along shotest axis
	    set axes  = "c a b"
	    set new_SG = "P21212"
	endif
	    
	# cannonize these
	if("$new_SG" == "P222")    set axes = "a b c"
	if("$new_SG" == "P212121") set axes = "a b c"
	
	# decide on new axis ordering (for asymmetric orthorhombics)
	if("$axes" != "") then
	    # get current axis ordering
	    # find out what the cannonical one would be
	    # then decide how to go from current ordering to the desired one
	    echo "$CELL" | nawk '{\
		# print out current axis order \
		print $1, "h"; print $2, "k"; print $3, "l"}' |\
	    sort -n |\
	    nawk '\
		# add cannonical axis names\
		NR==1{print $0, "a"} NR==2{print $0, "b"} NR==3{print $0, "c"}' |\
	    nawk -v axes="$axes" 'BEGIN{split(axes, abc)} {\
		# write desired axis ordering in front of cannonical one \
		print abc[NR], $0}' |\
	    sort |\
	    cat >! ${tempfile}order
	    set op = `nawk '{printf "%s", $3} NR~/[12]/{print ","}'  ${tempfile}order`

	    # check for inverted hand
	    set test = `echo " ${op}, ${op}, " | grep " h, k, l," | wc -l`
	    if($test == 0) then
		# flip the last sign
		set op = `echo $op | awk '{print $1,$2,"-" $3}'`
	    endif

	    set REINDEX = "reindex $op"
	    rm -f ${tempfile}order >& /dev/null
	
	    # this should give us a mapping between any two orthorhombics
	endif
    endif
    if("$new_SG" =~ R*) set new_SG = $SGnum
    if("$new_SG" =~ H*) set new_SG = $SGnum


    # only run mergeing step if we havn't already done so
    if((! -e ${logfile}.${printSG}.log)||($?IGNORE)) then
	echo -n "" >! ${logfile}.${printSG}.log

	# apply the resolution cutoff
	echo "RESOLUTION $RES" |\
	mtzutils hklin1 $rawfile hklout ${tempfile}.cut.mtz >>& ${logfile}.${printSG}.log
	
	# now re-index the input, raw mtz file
	reindex HKLIN ${tempfile}.cut.mtz \
		HKLOUT ${tempfile}.reindexed.mtz << end_reindex >>& ${logfile}.${printSG}.log
SYMMETRY $new_SG
$REINDEX
END
end_reindex
	rm -f ${tempfile}.cut.mtz >& /dev/null
	
	# make sure it's sorted
	sortmtz hklout ${tempfile}.${SG}.mtz  << end_sort >>& ${logfile}.${printSG}.log
	#VRSET -9e+38
	H K L M/ISYM BATCH I SIGI
	${tempfile}.reindexed.mtz
end_sort
	rm -f ${tempfile}.reindexed.mtz >& /dev/null
 	
	# run the merge script
	if("$printSG" !~ *-P1) then
	    ./$script ${tempfile}.${SG}.mtz >>& ${logfile}.${printSG}.log
	else
	    # rewrite the script to scale with one SG and merge in P1
	    cat $script |\
	    nawk '{print} tolower($1)~/^cycle/{print "DUMP '${tempfile}'_scales.txt"}' |\
	    cat >! ${tempfile}_scale.com
            cat $script |\
            nawk '{print} tolower($1)~/^cycle/{print "RESTORE '${tempfile}'_scales.txt";print "ONLYMERGE"}' |\
            cat >! ${tempfile}_merge.com
	    chmod a+x ${tempfile}_scale.com ${tempfile}_merge.com
 	    ${tempfile}_scale.com ${tempfile}.${SG}.mtz >>& ${logfile}.${printSG}.log
	    # now merge in P1
	    echo "SYMM P1" |\
	    reindex hklin ${tempfile}.${SG}.mtz \
		 hklout ${tempfile}.${printSG}.mtz >>& ${logfile}.${printSG}.log
	    ${tempfile}_merge.com ${tempfile}.${printSG}.mtz >>& ${logfile}.${printSG}.log

#	    rm -f ${tempfile}_scale.com ${tempfile}_merge.com >& /dev/null
#	    rm -f ${tempfile}.scales.txt >& /dev/null
#	    rm -f ${tempfile}.${printSG}.mtz >& /dev/null
	endif
	
	# see if truncate was run
	grep "SUITE: TRUNCATE" ${logfile}.${printSG}.log >& /dev/null
	if($status) then
	    # truncate was not run, so lets try to run it
	    set mergedmtz = `nawk '/Filename:/{mtz=$NF} /Scala/&&/termination/{print mtz}' ${logfile}.${printSG}.log`
	    set ASU = `echo "$volume" | nawk '{print $0/2500}'`
	    if((-e "$mergedmtz")&&("$ASU" != "")) then
		echo "nresidue $ASU" |\
		truncate hklin $mergedmtz hklout ${tempfile}.mtz >>& ${logfile}.${printSG}.log
	    	if($status) then
		    # truncate failed, try it again? 
		endif
	    endif
	endif
	
	# clean up
	rm -f ${tempfile}.${SG}.mtz >& /dev/null
    endif
    
    # print out Rmerge
    set Rmerge = `nawk '/ Overall /{line=$0;getline;if($0~/Rmerge/){print substr(line,index($0,"Rmerge"),6)}}' logs/merge.${printSG}.log | tail -1`
    set test = `echo "$Rmerge" | awk '{print ($1+0 == 0)}'`
    if($test ) then
        set Rmerge = `nawk '/ Overall: /{print $2}' logs/merge.${printSG}.log | tail -1`
    endif
    set test = `echo "$Rmerge" | awk '{print ($1+0 == 0)}'`
    if($test) then
	# use "Ranom" instead?
	set Rmerge = `nawk '/ Overall: /{print $5}' logs/merge.${printSG}.log | tail -1`
    endif
    set test = `echo "$Rmerge" | awk '{print ($1+0 == 0)}'`
    if($test) then
	echo "-"
    else
	# print out Rmerge
	echo $Rmerge | nawk '{printf "%5.1f        ", $1*100}'
    
	# Look for Systematic absence intensities
	cat  logs/merge.${printSG}.log |\
	nawk '/Systematic absences are OMITTED/,/Distributions/' |\
	nawk 'NF==5 && ! /[a-z]/ && $5 != 0{printf "%-4d%-4d%-4d I/sig = %8.3f\n", $1, $2, $3, $4/$5}' |\
	nawk '{print $6"|"$0}' |\
	sort -n |\
	nawk -F "|" '{print $2}' >! $tempfile
	set Absence = `cat $tempfile | tail -1`
	if("$Absence" != "") then
	    # print out the worst one
	    tail -1 $tempfile | nawk '{printf "%s", $0}'
	endif
	rm -f $tempfile
    
	echo ""
    endif
end

# now, pick the best point/space group

# gather stats from last round of runs

if($?saved_SGs) then
    set SGs = ( $saved_SGs )
    unset saved_SGs
endif
echo "" >! $tempfile
foreach SG ( $SGs )

    # get size of ASU (in A^3)
    set temp = `nawk -v SG=$SG '$4==SG && $1 < 500{print $2}' $CLIBD/symop.lib`
    set temp = `echo "$CELLvolume $temp" | nawk '$2+0!=0{print $1/$2}'`
    if("$temp" != "") set volume = "$temp"

    # indicates P1 merge after P1_scales_SG scaling
    set printSG = "$SG"
    if("$P1_scales_SG" != "" && "$SG" == "P1" && "$P1_scales_SG" != "P1") then
	set printSG = "${P1_scales_SG}-P1"
    endif
    
    set Rmerge = `nawk '/ Overall /{line=$0;getline;if($0~/Rmerge/){print substr(line,index($0,"Rmerge"),6)}}' logs/merge.${printSG}.log | tail -1`
    set test = `echo "$Rmerge" | awk '{print ($1+0 == 0)}'`
    if($test ) then
        set Rmerge = `nawk '/ Overall: /{print $2}' logs/merge.${printSG}.log | tail -1`
    endif
    set test = `echo "$Rmerge" | awk '{print ($1+0 == 0)}'`
    if($test) then
        # use "Ranom" instead?
        set Rmerge = `nawk '/ Overall: /{print $5}' logs/merge.${printSG}.log | tail -1`
    endif
    set test = `echo "$Rmerge" | awk '{print ($1+0 == 0)}'`
    if(! $test) then
	echo -n "$SG " >> $tempfile
	echo -n "$Rmerge " >> $tempfile
	cat  logs/merge.${printSG}.log |\
	nawk '/Systematic absences are OMITTED/,/Distributions/' |\
	nawk 'NF==5 && ! /[a-z]/ && $5 != 0{print $4/$5}' |\
	sort -n | tail -1 >! $tempfile.absent
	set Absence = `tail -1 $tempfile.absent`
	echo "$Absence" >> $tempfile
    endif
end
rm -f ${tempfile}.absent >& /dev/null

# do some statistics
awk '{print $2"|"$0}' $tempfile | sort -n |\
nawk -F "|" '{print $2}' >! ${tempfile}sorted
set best_SG     = `nawk 'NF==2' ${tempfile}sorted | head -1 | nawk '$2+0>0.0{print $1}'`
set P1_Rmerge   = `nawk '$1=="P1"{print $2}' $tempfile`
set min_Rmerge  = `nawk 'NF==2' ${tempfile}sorted | head -1 | nawk '$2+0>0.0{print $2}'`
set max_Rmerge  = `nawk 'NF==2' ${tempfile}sorted | tail -1 | nawk '$2+0>0.0{print $2}'`
set mean_Rmerge = `nawk '$2+0>0{sum += $2;++count} END{if(count) print sum/count}' $tempfile`
set rmsd_Rmerge = `nawk -v avg=$mean_Rmerge '$2+0>0{sum+=($2-avg)^2;++count} END{if(count) print sqrt(sum/count)}' $tempfile`
rm -f ${tempfile}sorted >& /dev/null

# do we already have a "best" SG for obtaining the P1 baseline?
if("$P1_scales_SG" == "" && "$best_SG" != "P1" && "$best_SG" != "") then
    # probably always a good idea to try and use P1_scales_SGs scales, but merge with P1
    set P1_scales_SG = "$best_SG"
    set saved_SGs = ( $SGs )
    set SGs = P1
    echo "scaling with $P1_scales_SG to merge in P1"
    goto SGcheck
endif

set Rmerge_cutoff = ""

# see if P1 run worked
if("$P1_Rmerge" != "") then
    # P1 run succeeded, so "no symmetry" hypothesis has been considered
    if("$best_SG" == "P1") then
	# P1 had a lower Rmerge than anything else
	set Rmerge_cutoff = `echo $min_Rmerge 0.1 | nawk '{print $1+1.0*$2}'`
    else
	# P1 worked, but it was not the best score
 	set Rmerge_cutoff = `echo $min_Rmerge 0.1 | nawk '{print $1+1.0*$2}'`
    endif
else
    # P1 run failed, but we gotta have it
    # what do we do?
    set Rmerge_cutoff = 0.2
endif

set test = `echo $Rmerge_cutoff 0.5 | nawk 'NF==2{print ($1 > $2)}'`
if("$test" == "1") set Rmerge_cutoff = 0.5

# filter for SGs below the mergeing criteria
set SGs = `nawk -v max=$Rmerge_cutoff '$2+0 < max && $3+0 < 4 {print $1}' $tempfile`

if("$SGs" == "") then
    echo "WARNING: none of these space groups merged well."
    if("$P1_Rmerge" == "") echo "not even P1"
    echo "         try to tune up $script next time."
    exit 9
endif

# if $?PG, we have been here before
if($?PG) then
    if($#SGs == 1) then
	echo "Possible space group is: $SGs"
    else
	echo "Possible space groups are: $SGs"
    endif
#    echo "$SGs" >! SGs.txt
    rm -f $tempfile
    exit
endif
    
# extract protein-compatible space groups from CCP4 in order of increasing symmetry
#nawk '$5 ~ /^PG/ && ! /m/ && ! /bar/ && $1 < 500' $CLIBD/symop.lib >! $tempfile
cat ${CLIBD}/symop.lib |\
nawk '$5 ~ /^PG/ && ! /m/ && ! /bar/ && $1 < 500{\
        printf "%s ", $0; for(op=$2;op>0;--op){getline;printf $1 $2};print ""}' |\
nawk '{print length($NF), $0}' |\
sort -n |\
nawk '{print $2,$3,$4,$5,$6,$7}' >! $tempfile

# pursue all potential possibilites (but there should really only be one)
set temp = ""
echo "checking for screw axes in $SGs"
foreach SG ( $SGs )

    # get size of ASU (in A^3)
    set tempV = `nawk -v SG=$SG '$4==SG && $1 < 500{print $2}' $CLIBD/symop.lib`
    set tempV = `echo "$CELLvolume $tempV" | nawk '$2+0!=0{print $1/$2}'`
    if("$temp" != "") set volume = "$tempV"

    # get point group of $SG
    set PG = `nawk -v SG=$SG '$4==SG{print $5}' $tempfile`
    set latt = `echo $SG | nawk '{print substr($1,1,1)}'`
    
    # now get all space groups in $PG on $latt
    set temp =  "$temp "`nawk -v PG=$PG -v latt=$latt '$5==PG && substr($4,1,1)==latt{print $4}' $tempfile`

    # add pseudo-SGs for asymmetric orthorhombics
    if("$PG" == "PG222") then
	set pseudoSGs = ""
	foreach sg ( $temp )
	    set pseudoSGs = "$pseudoSGs $sg"
	    if("$sg" == "P2221") set  set pseudoSGs = "$pseudoSGs P2212 P2122"
	    if("$sg" == "P21212") set  set pseudoSGs = "$pseudoSGs P21221 P22121"
	end
	set temp = "$pseudoSGs"
    endif
end
rm -f $tempfile

set SGs = `echo $temp`

# check each of these

goto SGcheck


exit