#!/usr/local/bin/perl -w
# finding quantiles

# JaeSub Hong, 2003-2005, version 1.7
# Please report any problem or suggestion at jaesub@head.cfa.harvard.edu
# 
# Calculate quantiles from a distribution
# Refer to 
#	J. Hong, E.M. Schlegel & J.E. Grindlay, 
#	2004 ApJ 614 p508 and references therein
# 
# usage
#	quantile.pl -frac frac1,frac2,... 
#		-src src_file 
#		-range lower_limit:upper_limit 
#		[-bkg bkg_file 	-ratio ratio]
# examples
#	quantile.pl -src src.txt -range 0.3:8.0
#	quantile.pl -src src.txt -range 0.3:8.0 -bkg bkg.txt -ratio 0.2
#
# required input:
#	-frac: list of quantile fractions 
#		default: 0.25,0.33,0.5,0.67,0.75
#	-src : src file containing the list of values in the source region 
#		(e.g. energies of photons in the source region separated
#		by space, tab, or return)
#	-range :  the full range of values
#
# required input for bkgnd subtraction:
#	-bkg : bkg file containing the list of valuess in the bkgnd region 
#		(e.g. energies of photons in the bkgnd region)
#	-ratio : ration of the source to bkgnd region
#
# optional input
#	-nip : number of interplation points for bkgnd subtraction
#		the higher the better, but the slower
#		1/nip*range should be smaller than the resolution of the values
#		default 2000
#	-fixerror or -nofixerror : if requested, the following correction 
#		for error estimation is automatically applied
#		default :  -fixerror
#		1. when the error estimation fails, it normally returns
#			-1 for the error, but we can set the error to 
#			the whole range, which is reasonable.
#		2. when net<100, the error could be overestimated 
#			as much as 20-30% (See Fig.9 in Hong et al 2004)
#			a relatively moderate correction
#				x1./(1.+0.5/net) 
#			is applied to correct this.
#		3. correction for error due to bkgnd is applied 
#			sqrt(1+Nbkg/Nsrc) = sqrt(1.+(src-net)/net) (See Fig.9)
#		4. Error for QDy is likely overestimated
#			due to correlation of Q25/Q75, 
#			so x0.8 applied to compensate this 
# 		Refer to Hong et al 2004
#	-sort or -nosort: src and bkg will be sorted in ascending order, but
#		they are already sorted, you can skip the sorting procedure
#		by -nosort
#		default : -sort
#	-qccd or -noqccd: generate QCCD x and y values (QDx, QDy) 
#		default -qccd
#
# output
#	fraction, Ex%, error_Ex%, Qx, error_Qx
#		where Qx = (Ex% - Elo) / (Eup - Elo)
#
#	when requested,
#	QDx  (-error,+error)
#	QDy  (-error,+error)
#		where
#		QDx = log10 [m/(1-m)] where m=Q50
#		QDy = 3* Q25/Q75
#		error = 1 sigma
#
#

#----------------------------------------------------------------------
# get the parameter files from command line
use Getopt::Long;
GetOptions(
	"frac=s"	=> \$frac,
	"src=s"		=> \$src,
	"bkg=s"		=> \$bkg,
	"ratio=s"	=> \$ratio,
	"range=s"	=> \$range,
	"sort!"		=> \$sort,
	"fixerror!" 	=> \$fixerror,
	"qccd!" 	=> \$qccd,
	"nip=i"		=> \$nip,
	"help" 		=> \$help,
	"debug:i" 	=> \$debug);

$debug = 0 unless defined $debug;

if (defined $help
	|| !defined $src
	|| !defined $range
	) {
	($usage = <<"	EOFHELP" ) =~ s/^\t\t//gm;
		Find quantiles for given fractions
		usage: $0  
			-src file
			-range xx.x:yy.y
			[-bkg file [-ratio x.x]]
			[-frac 0.xx,0.yy,...]
			[-fixerror | -nofixerror]
			[-sort | -nosort]
			[-Nip xxxx]
			[-help] [-debug [X]] 
		options
			-frac     	interested fraction (def 0.25,0.33,0.5,0.67,0.75)
			-src      	src file
			-bkg      	bkg file
			-ratio    	ratio of src/bkg area (def 1.0)
			-range 	  	range of the values
			-[no]sort   	[don't] sort input values (def -sort)
			-[no]fixerror 	apply various corrections in error calc
				  	(def -fixerror)
			-[no]qccd     	generate QCCD x and y values (QDx, QDy)
			-nip      	number of energy point for interpolation, (def 2000)
					1/nip should be smaller than the resolution
			-help     	print this message
			-debug    	set the level of debug
	EOFHELP
	print $usage;
	exit;
}

$sort = 1 unless defined $sort;
$fixerror = 1 unless defined $fixerror;
$qccd = 1 unless defined $qccd;

$frac	= "0.25,0.33,0.5,0.67,0.75"  unless defined $frac;
@frac	= split/,/, $frac;
@frac 	= (0.25,0.50,0.75, @frac) if $qccd;
		# in case, user forget to request Q25, Q50, Q75
		# could be redundant, but no big deal since only 3 more elements
		# yeah right, 3 out of 8.

$ratio	= 1.0 unless defined $ratio;	
$nip	||= 10000;	# $nip > 0

$maxit 	= 100;
$eps 	= 3.0e-7;
$fpmin 	= 1.0e-30;

#----------------------------------------------------------------------
# get the range
($ll, $ul) = split/[:,]/, $range;
$rl = $ul-$ll;
die "wrong range: $ul < $ll\n" if $rl <= 0 ;
die "wrong ratio: $ratio < 0.0\n" if $ratio < 0.0 ;

# read source values
open(SRC, "< $src") || die "can't open $src\n";
@src = <SRC>; close(SRC);
foreach (@src) { chomp; s/\s+//mg; };
@src = grep { $_ >= $ll && $_ <= $ul} @src;

$n_src = $#src+1;

$n_bkg = 0;
$n_net = $n_src;

if ($n_net < 1.0) {
	@Ex 	= map { $_ * $rl+ $ll } @frac;
	@err_Ex = map { -1. } @frac;
	@err_Ex = fix_error(\@Ex, \@err_Ex, $ll, $ul) if $fixerror;
	result();
	exit;
}
@src = sort {$a <=> $b} @src if $sort;

# read bkgnd values if provided
if (defined $bkg) {
	open(BKG, "< $bkg") || die "can't open $bkg\n";
	@bkg = <BKG>; close(BKG);
	foreach (@bkg) { chomp; s/\s+//mg; };
	@bkg = grep { $_ >= $ll && $_ <= $ul} @bkg;
	$n_bkg = $#bkg+1;
} 

# print out the result for src_only case
if ($n_bkg <= 0) {
	@Ex 	= order_stat(\@frac, \@src, $ll, $ul);
	@err_Ex = error_mj  (\@frac, \@src, $ll, $ul);
	@err_Ex = fix_error (\@Ex,   \@err_Ex, $ll, $ul) if $fixerror;
	result();
	exit;
}

# now let's go for the case with bkgnd
@bkg = sort {$a <=> $b} @bkg if $sort;
$n_net = $n_src - $ratio * $n_bkg;

# when we don't have one net count
if ($n_net < 1.0) {
	@Ex 	= map { $_ * $rl+ $ll } @frac;
	@err_Ex = map { -1. } @frac;
	@err_Ex = fix_error(\@Ex, \@err_Ex, $ll, $ul) if $fixerror;
	result();
	exit;
}

# Now the real thing
for ($i=0;$i<$nip;$i++) {
	$inc = ($i+0.5)/$nip;
	push @ifrac, 	$inc;
	push @iE, 	$inc*$rl+$ll;
	push @nsrc, 	$inc*$n_src;
	push @nbkg, 	$inc*$n_bkg;
}

# get the quantiles at finely spaced frac for both src and bkg values
@iEx_src = order_stat(\@ifrac, \@src, $ll, $ul);
@iEx_bkg = order_stat(\@ifrac, \@bkg, $ll, $ul);

# remove the redundant platue values in the quantiles
($sa_nsrc, $sa_iEx_src) = simplify(\@nsrc, \@iEx_src);
($sa_nbkg, $sa_iEx_bkg) = simplify(\@nbkg, \@iEx_bkg);
@ic_src=interpol($sa_nsrc, $sa_iEx_src, \@iE);
@ic_bkg=interpol($sa_nbkg, $sa_iEx_bkg, \@iE);

# forward subtraction
foreach (@ic_src){
	$_ = 0.0  if $_ < 0.0;
	$_ = $n_src  if $_ > $n_src;
}
foreach (@ic_bkg){
	$_ = 0.0  if $_ < 0.0;
	$_ = $n_bkg  if $_ > $n_bkg;
}
$cur = $ic_src[0] - $ic_bkg[0] *$ratio;
$cur = 0.0  if $cur < 0.0;
$cur = $n_net  if $cur > $n_net;
push @ic_net, $cur;
push @ic_net_, 0.0;
for ($i=1;$i<$nip;$i++) {
	$cur = $ic_src[$i] - $ic_bkg[$i] * $ratio;
	$cur = $n_net  if $cur > $n_net;
	$cur = $ic_net[$i-1] if $cur < $ic_net[$i-1];
	push @ic_net, $cur;
	push @ic_net_, 0.0;
}

# backward subtraction
foreach (@ic_src){ $_ = $n_src - $_;}
foreach (@ic_bkg){ $_ = $n_bkg - $_;}
$cur = $ic_src[$nip-1] - $ic_bkg[$nip-1] * $ratio;
$cur = 0.0  if $cur < 0.0;
$cur = $n_net  if $cur > $n_net;
$ic_net_[$nip-1]=$cur;
for ($i=$nip-2;$i>=0;$i--) {
	$cur = $ic_src[$i] - $ic_bkg[$i] * $ratio;
	$cur = $n_net  if $cur > $n_net;
	$cur = $ic_net_[$i+1] if $cur < $ic_net_[$i+1];
	$ic_net_[$i]= $cur;
}

# average forward and backword
for ($i=0;$i<$nip;$i++) {
	push @net_frac, ($ic_net[$i]-$ic_net_[$i]+$n_net)/2./$n_net;
}
($sa_iE, $sa_net_frac) = simplify(\@iE, \@net_frac);
@Ex = interpol($sa_iE, $sa_net_frac, \@frac);

# regenerate photons to match @Ex at @frac
$n_net_ = sprintf("%d", $n_net+0.5);
for ($i=0; $i<$n_net_;$i++) { push(@tEx, ($i*2.+1.)/$n_net_/2.); }
@ip_src_ph = interpol($sa_iE, $sa_net_frac, \@tEx);

@err_Ex = error_mj(\@frac, \@ip_src_ph, $ll, $ul);
@err_Ex = fix_error(\@Ex,  \@err_Ex, $ll, $ul) if $fixerror;

result();
exit;

#----------------------------------------------------------------------
# sub routines

sub result{

	print "range  $ll $ul\n";
	print "source $n_src\n";
	print "bkgnd  $n_bkg\n";
	print "net    $n_net\n";
	print "ratio  $ratio\n";

	$sorted = ($sort)? "yes" : "no";
	$fixed = ($fixerror)? "yes" : "no";
	print "sort   $sorted\n";
	print "fixerr $fixed\n";

	print "fraction(x)   Ex%   error_Ex%     Qx       error_Qx\n";


	@Qx = map { ($_ - $ll)/$rl } @Ex;
	@err_Qx = map { $_ /$rl } @err_Ex; # be careful, no subtraction for err_Ex

	$io = ($qccd) ?  3 : 0;
	for ($i=$io;$i<=$#frac;$i++) {
		printf " %.3f  %10.3e %10.3e %10.3e %10.3e\n",
			$frac[$i],$Ex[$i],$err_Ex[$i],$Qx[$i],$err_Qx[$i];
	}

	exit unless $qccd;

	# if requested, now calculate QCCD x and y (QDx, QDy) and their errors

	$QDx = log($Qx[1]/(1.-$Qx[1]))/log(10.);
	$m_l = $Qx[1] - $err_Qx[1]; 
	$m_u = $Qx[1] + $err_Qx[1];

	$bigN = 1.e38; # big number for divergence, is this big enough or too big
	$err_QDx_l = ($m_l <= 0.0 || $err_Qx[1] < 0.0) ? 
			$bigN : $QDx-log($m_l/(1.-$m_l))/log(10.);
	$err_QDx_u = ($m_u >= 1.0 || $err_Qx[1] < 0.0) ? 
			$bigN : log($m_u/(1.-$m_u))/log(10.)-$QDx;

	$QDy = ($Qx[2] <= 0.0) ? 3. : 3.*$Qx[0]/$Qx[2];
	$err_QDy =  ($Qx[0] <= 0.0 || $err_Qx[0] < 0.0 || $err_Qx[2] < 0.0) ? 3. :
		$QDy * sqrt(($err_Qx[0]/$Qx[0])**2. + ($err_Qx[2]/$Qx[2])**2.);

	# 0.8 is correction factor #3.
	$ec3 = ($fixerror)? 0.8 : 1.0;

	$err_QDy_l = ($QDy - $err_QDy <= 0.0)? $QDy 	: $ec3 * $err_QDy;
	$err_QDy_u = ($QDy + $err_QDy >= 3.0)? 3.-$QDy 	: $ec3 * $err_QDy;

	printf "%s  %10.3e (-%9.3e, +%9.3e)\n","QDx", $QDx, $err_QDx_l, $err_QDx_u;
	printf "%s  %10.3e (-%9.3e, +%9.3e)\n","QDy", $QDy, $err_QDy_l, $err_QDy_u;
}

#----------------------------------------------------------------------
# The followings are based on the routines: betacf.c, betai.c and
#              gammln.c described in section 6.2 of Numerical Recipes,
#              The Art of Scientific Computing (Second Edition)
sub gammln{
	@cof = (76.18009172947146,-86.50532032941677,
		24.01409824083091,-1.231739572450155,
		0.1208650973866179e-2,-0.5395239384953e-5);

	my ($y) = @_;
	my $x = $y;
	my $tmp=$x+5.5;
	$tmp -= ($x+0.5)*log($tmp);
	my $ser=1.000000000190015;
	for ($j=0;$j<=5;$j++) { $ser += $cof[$j]/++$y;};
 	return -$tmp+log(2.5066282746310005*$ser/$x);
}

sub betacf{
	my ($a, $b, $x) = @_;

	my $qab=$a+$b; 
	my $qap=$a+1.0;
	my $qam=$a-1.0;

	my @ans=();
	my $c=1.0; 
	my $d=1.0-$qab*$x/$qap;

	if (abs($d) < $fpmin) {$d=$fpmin;};
	$d=1.0/$d;
	my $h=$d;
	for ($m=1;$m<=$maxit;$m++) {
		$m2=2*$m;
		$aa=$m*($b-$m)*$x/(($qam+$m2)*($a+$m2));
		$d=1.0+$aa*$d; 
		if (abs($d) < $fpmin) {$d=$fpmin;};
		$c=1.0+$aa/$c;
		if (abs($c) < $fpmin) {$c=$fpmin;};
		$d=1.0/$d;
		$h *= $d*$c;
		$aa = -($a+$m)*($qab+$m)*$x/(($a+$m2)*($qap+$m2));
		$d=1.0+$aa*$d; 
		if (abs($d) < $fpmin) {$d=$fpmin;};
		$c=1.0+$aa/$c;
		if (abs($c) < $fpmin) {$c=$fpmin;};
		$d=1.0/$d;
		$del=$d*$c;
		$h *= $del;
		last if abs($del-1.0) < $eps; 
	}
	printf "ERROR: a or b too big, or MAXIT too small in betacf\n"
		if $m > $maxit;
	return $h;
}

sub betai{
	my ($a, $b, @xx) = @_;

	my @ans=();
	foreach $x (@xx) {
		printf "Bad x in routine betai\n" if $x < 0.0 || $x > 1.0 ;
		if ($x == 0.0 || $x == 1.0)  { 
			$bt=0.0;
		} else { 
			$bt=exp(gammln($a+$b)-gammln($a)-gammln($b)
				+$a*log($x)+$b*log(1.0-$x));
		};
		if ($x < ($a+1.0)/($a+$b+2.0)){ 
			push(@ans, $bt*betacf($a,$b,$x)/$a);
		} else { 
			push(@ans, 1.0-$bt*betacf($b,$a,1.0-$x)/$b);
		}
	}
	return @ans;
}

#----------------------------------------------------------------------
# simple interpolation routines

sub value_locate {
	my ($x, $nx, $ilo, $ihi) = @_;
#	$ilo = 0;
#	$ihi = @$x -1;

	return $ihi if $nx >= $x->[$ihi]; 
	return $ilo if $nx <= $x->[$ilo]; 
	# make sure $nx >= $x->[$ilo] unless $ilo==0; or use $ilo==0;

	for (;;) {
    		my $middle = int(($ilo + $ihi)/2);
    		if ($middle == $ilo) { return $ilo; }
	    	if ($nx < $x->[$middle]) { $ihi = $middle; }
	    	else { $ilo = $middle; }
  	}

}

sub interpol{
	my ($y, $x, $nx) = @_;

	my $last = @$x-1;
	my @ans =();
	my $dy = 0.0, $ny = 0.0;
	my $j = 0, $k = 0;

	foreach $nx_ ( @{$nx} ) {
		$j = value_locate($x, $nx_, 0, $last);
		$j = $last-1 if $j >= $last;
		$k = $j + 1;
		if ($x->[$k] == $x->[$j]) {
			$ny = ($y->[$k]+$y->[$j])/2.;
		} else {
			$dy = ($y->[$k]-$y->[$j])/($x->[$k]-$x->[$j]);
			$ny = $dy*($nx_-$x->[$j]) + $y->[$j];
		}
		push (@ans, $ny);
	}
	return @ans;
}

sub simplify{
	my ($y, $x) = @_;
	my $last = @$x-1;

	my @sy = ($y->[0]);
	my @sx = ($x->[0]);
	
	my $strip = 0;
	foreach $i (1 .. $last) {
		$j = $i-1;
		if ($x->[$i] == $x->[$j]) {
			$strip = 1;
		} else {
			if ($strip == 1){
				push @sy, $y->[$j];
				push @sx, $x->[$j];
				$strip = 0;
			}
			push @sy, $y->[$i];
			push @sx, $x->[$i];
		}
	}
	return (\@sy, \@sx);
}

#----------------------------------------------------------------------
# quantile routines
sub order_stat{
	my ($frac, $values, $ll, $ul) = @_;
	my @Ex=();
	for (my $i=0;$i<@$values;$i++) {
		push @Ex, ($i*2.0+1.0)/2./@$values;
	}

	my @values_ = ($ll,@{$values},$ul);
	my @Ex_ = (0.0,@Ex,1.0);

	my ($sa_values_, $sa_Ex_) = simplify(\@values_, \@Ex_);
	my @ans = interpol($sa_values_,$sa_Ex_, $frac);
	return @ans;
}

# Error estimation by Maritz-Jarrett method
sub error_mj{
	my ($frac, $values, $ll, $ul) = @_;

	my @m=();
	my @a=();
	my @b=();

	my $hrange = ($ul-$ll)/2.;

	my $nvalues = @$values;
	for (my $i=0;$i<@$frac;$i++) {
 		my $m_ = ($frac->[$i])*($nvalues)+0.5; #int should be gone!!!

		push @m, $m_;
		push @a, $m_-1.0;
  		push @b, $nvalues-$m_;

	}

	my @i_src=();
	for ($i=0;$i<=$nvalues;$i++) {
   		push @i_src, $i/$nvalues;
	}
  	my @values_ = (@{$values});
	my @ans=();
	for ($i=0;$i<@$frac;$i++){
		if ($a[$i] <= 0.0 || $b[$i] <= 0.0) {
			push @ans, -1.0;
			next;
		}
		my @beta=betai($a[$i],$b[$i],@i_src);
		my ($c0, $c1, $c2) = (0.0, 0.0, 0.0);
		for (my $j=0;$j<=$#values_;$j++){
			$c0 = $values_[$j] * ($beta[$j+1] - $beta[$j]);
			$c1 += $c0;
			$c2 += $c0*$values_[$j];
		}
		$c0 = $c2-$c1*$c1;
		if ($c0 >= 0.0) {
			$c0 = sqrt($c0);
		} else {
			# to get around small number trouble
			if (abs($c0/$c2) < $eps*$eps) {
				$c0 = sqrt(abs($c0));
			} else {
				$c0 = -1.0;
			}
		}
		push @ans, $c0;
	}

	return @ans;
}


sub fix_error {
	my ($lEx, $lerr, $ll, $ul) = @_;

	my @ans=();
	for ($i=0;$i<@$lEx;$i++){
		$error = $ul - $lEx->[$i];
		$error_ = $lEx->[$i] - $ll;
		$error = $error_ if $error < $error_;
		if ($lerr->[$i] < 0.0 || $lerr->[$i] > $error) {
			push @ans, $error;
		} else {
			my $n_net_temp = ($n_net>1.0)? $n_net: 1.0;
			push @ans, 
				$lerr->[$i]/(1.+0.5/$n_net_temp)
				*sqrt(1.+($n_src-$n_net)/$n_net);
			# error correction #1, #2
		}
	}
	return @ans;

}