#!/bin/sh
VERSION=3.1.2
LICENSE="Copyright (C) 2003-2007, 2009, 2011-2013, 2017 Dimitar Ivanov

License: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law."
################################################################################
#
# maphimbu - all purpose histogram builder for numerical and text data in 1-d
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
################################################################################
#
MYNAME=`basename $0`
RC=0
DX=0
XAX=1
YAX=2
CX=0
SU=0
MV=0
PFORM="%d"
SORT=cat
SORT_OPT="-"
COUNT_COL=2
QUANTILES_NR=10
pr_dnumform="%15.15g"           # Print format of the numerical result

### We need an AWK supporting assignments of variables
exec 3>&2 2>&-
for a in awk gawk nawk
do
  [ "`echo |$a -v a=a '{}' 2>&1`" = "" ] && AWK=$a && break
done
exec 2>&3
[ "x$AWK" = x ] && \
  echo "Error: can't find 'awk' supporting assignments" && \
  exit 2

### Do sort correctly any data including numbers
#
_sort_data_on_demand_()
{
   if [ x$SORT_SEP = x ]; then
        $SORT $SORT_OPT
   else
            # Prepare the data in order to sort correctly floats, then do sort,
            # and finally remove the first column used for the numerical sorting
        $AWK "{ printf(\"%.16f $SORT_SEP%s\n\", \$$1, \$0) }" \
        | $SORT $SORT_OPT \
        | cut -d$SORT_SEP -f2-
   fi
}

_if_arg_()
{
  c2=$(printf "%.2s" "-$1")
  [ "$c2" = "--" ] \
  && return 1 \
  || return 0
}

_setup_numerical_sorting_()
{
  SORT=sort
  SORT_OPT="-nk1 $SORT_OPT"
  SORT_COL="$1"
  SORT_SEP=:
}

_setup_accumulation_()
{
   [ $ACC_COL ] || ACC_COL=2
   ACC_PROG=mintegrate
   which $ACC_PROG 2>&1 |grep -q "^\/" \
         || { 
              echo "Error: can't find program '$ACC_PROG' in PATH"
              exit 3
            }
}

_accumulate_result_()
{
  [ "$ACC_PROG" ] || { cat ; return 0; }
  [ "$pr_dnumform" ] && pr_dnumform="-p $pr_dnumform" 
  $ACC_PROG -S -y $1 $pr_dnumform
}

_setup_normalized_counting_()
{
  NORM=$1
  ACC_COL=3
  UN_HEAD="NORMALIZED_COUNTS"
}

### Compute quantiles of the underlying probability distribution:
#   https://en.wikipedia.org/wiki/Quantile
#
_print_only_quantils_()
{
  if [ $QUANTILES ]; then
       $AWK -v quants=$QUANTILES -v q_col=$QUANT_COL -v count_col=$COUNT_COL \
            'BEGIN { CONVFMT = "%.17g" ; OFMT    = "%.17g" ;
               nrq = split( quants, q_val, "," )
               if      ( nrq == 1 && quants >= 1 ) {
                   ddq = 1/quants
                   nrq = quants + 1
                   for( j=1 ; j <= nrq; j++ )
                         q_val[j] = j*ddq
               } else if( nrq == 1 && quants < 1 ) {
                   nrq = 2
                   q_val[1] = quants
                   q_val[2] = 1
               } else  {              # After successful split 
                   nrq++
                   q_val[nrq] = 42.0  # Just a number higher than 1
               }
               for( j=1 ; j <= nrq ; j++ ) {
                    q_prob[j] = 0
                    q_last_prob[j] = 0
               }
               j_0 = 1
             }
             COMP_QUANTILES
             {
               nr_rec += $count_col
               for( j=j_0 ; j <= nrq ; j++ ) {
                    if( $q_col >= q_val[j] && $q_col < q_val[j+1] ) {
                        if( q_prob[j] == 0 ) { # if no data record
                            q_prob[j] = $q_col
                            q_prob_rec[j] = $0
                            q_prob_nr_rec[j] = nr_rec
                            j_0++
                            break
                        }
                    }
                    q_last_prob[j] = $q_col  # Keep the last value of P(x)
               }
             }
             END {
                   # Guarantee that the Quantile(1.0) will be printed regardless
                   # of the output format - in this case the last entry is
                   # always equal to 1.0!
               # if( q_val[nrq-1] == 1 && q_prob[nrq-1] == 0 ) {
               if( q_val[nrq-1] == 1 ) {
                   q_prob_rec[nrq-1] = $0
                   q_prob_nr_rec[nrq-1] = nr_rec
               }
               for( j=1 ; j< nrq ; j++ ) {
                    if( q_prob_rec[j] )
                        printf( "%s   %15d\n", q_prob_rec[j], q_prob_nr_rec[j] )
               }
             }'
  else
       cat
  fi
}

### Show usage
#
show_usage()
{
d=$2
sep=`echo |awk '{printf( "%080s", 0 )}' |tr 0 "$2"`

cat << !EOH
$sep
$1
$sep

Usage: $MYNAME [OPTION]... [FILE] 

Options:
   -x <int>       $d x-data column (default is $XAX)
   -y <int>       $d y-data column (default is $YAX);
                    It is considered only when combined with '-m' or '-S'
   -d <float|int> $d delta size (dx-resolution) in case of numerical data;
                    Keep in mind that 2 is not the same as 2.0: in the
                    first case the numbers are treated as integers, in
                    the latter as floats.
   -n             $d normalize the result
                    A) non-numerical data: n(x) = counts(x)/sum_of_counts
                    B) numerical data: the normalized probability density func.
                       is P(-INF < x <INF) = Int_[-INF:INF] P(x)*dx = 1 ;
                       For the numerical integration an open x-data interval
                       is assumed.
   -N             $d normalize data against the total sum of counts
                    (same as option '-n' with case A forced)
   -U             $d normalize the result with x-range mapped to unit;
                    The x-values are mapped to the interval [0:1],
                    respectively Int_[0:1] P(x)*dx = 1; This option is
                    mutually exclusive with '-n'.
   -S             $d compute sums and normalized sums of y-values;
                    The total sum of all y-values is used for the 
                    normalization.
   -m             $d compute the arithmetic average of the y-values
                    in the underlying x-bin
   -a             $d compute accumulated sums of the result data column
                    (default is 2, or in case of normalization column 3);
                    The program 'mintegrate' is need in this case.
   -Q [int]       $d compute quantiles of the resulting distribution function;
                    Options '-N', '-a' and '-g1' are implied and also the
                    program 'mintegrate' is need in this case. The argument
                    is optional and can be an integer (default is $QUANTILES_NR)
                    or a list of floating numbers e.g. "0.5,0.9,0.95".
                    For further statistical understanding refer to
                    https://en.wikipedia.org/wiki/Quantile
   -q [int]       $d compute quantiles of the input data (see also '-Q');
                    It is assumed that the data input is prepared by using
                    the output of '$MYNAME -Ng1'. The program 'mintegrate' is
                    also needed. Here the argument is optional (default is 4)
                    and defines the data column number of the accumulated
                    normalized partial sums by which the quantiles are computed.
                    Further, the column numbers (with defaults 2 and 3) of the
                    data counts and the normalized counts can be defined by
                    using the options '-y', and '-x' respectively. By providing
                    option '-Q' with an argument, the quantiles to be printed
                    are specified then.
   -s <int>       $d sort output using the specified column;
                    Header is skipped in this case.
   -g <int>       $d sort output numerically using the specified column;
                    Header is skipped in this case.
   -r             $d do reverse sorting (in combination with '-g' and '-s')
   -p <str>       $d print format of the numerical result ("$pr_dnumform" is default)
   -C             $d center histogram bins (shifts x-axis by 0.5*dx)
   -X             $d treat whole line as one string (single data record)
   -H             $d print a data description header
   -V             $d print program version and exit
   --version      $d output version and copyright information
   --help         $d display help
   -h             $d display short help (options summary)


Note: this software is not intended for use in high-precision computations.


Report bugs to <gnu@mirendom.net>

!EOH
}

### Show version
#
show_version()
{
     cat << !ver
$MYNAME $VERSION
$LICENSE
!ver
}

[ "$1" = "--version" ] && show_version && exit
[ "$1" = "--help" ] && show_usage "This program produces 1-d histograms from numerical or text data input. It can be also used to estimate the probability distribution function of a numerical variable - see http://en.wikipedia.org/wiki/Probability_density_function." " " && exit

################################################################################
#
# MAIN
#
opts="aChHnNUmd:x:Xy:Vp:rs:g:SQ::q::"
opts=`getopt $opts $*` || { opts="-h"; RC=1; }
set -- $opts

while [ $# -gt 0 ]
do
   case $1 in
        -h) show_usage "$MYNAME $VERSION: histogram builder for 1-d numerical and text data" "-" |egrep "^($MYNAME|Usage:|Options:|Note:|\ *-)"
            exit $RC
            ;;
        -d) DX=$2  
               # Try to find out the floating point separator
            FSEP="`echo $DX |tr -d '[0-9]'`"
            [ -n "$FSEP" ] && FLOAT="0${FSEP}5"
            shift 2
            ;;
        -n) _setup_normalized_counting_ 1 ACC_COL
            shift 1
            ;;
        -N) _setup_normalized_counting_ 2 ACC_COL
            shift 1
            ;;
        -U) _setup_normalized_counting_ 3 ACC_COL
            UN_HEAD="UNIT_X dx=%-.5g        UNIT_COUNTS"
            shift 1
            ;;
        -a) _setup_accumulation_
            shift 1
            ;;
        -q) [ "$QUANTILES" ] || QUANTILES=$QUANTILES_NR
            QUANTILIZED=yes
            COUNT_COL=$YAX
            ACC_COL=$XAX
            [ $ACC_COL -eq 1 ] \
            && ACC_COL=$(expr $COUNT_COL + 1)
            _setup_accumulation_
             QUANT_COL=$(expr $ACC_COL + 1)
            _if_arg_ "$2" && { QUANT_COL=$2 && shift 1 ; }
            shift 1
            ;;
        -Q) [ "$QUANTILES" ] || QUANTILES=$QUANTILES_NR
            _if_arg_ "$2" && { QUANTILES=$2 && shift 1 ; }
            _setup_numerical_sorting_ 1
            _setup_normalized_counting_ 2 ACC_COL
            _setup_accumulation_
            QUANT_COL=$(expr $ACC_COL + 1)
            shift 1
            ;;
        -x) XAX=$2
            shift 2
            ;;
        -X) XAX=0
            shift 1
            ;;
        -y) YAX=$2
            shift 2
            ;;
        -m) MV=1
            MV_HEAD="   Y_MEAN_VALUE   Y_STD_DEV"
            shift 1
            ;;
        -S) SU=1
            MV_HEAD="   Y_SUM   NORMALIZED_Y_SUM"
            shift 1
            ;;
        -g) _setup_numerical_sorting_ $2
            shift 2
            ;;
        -s) SORT=sort
            SORT_OPT="-nk$2 $SORT_OPT"
            shift 2
            ;;
        -r) SORT=sort
            SORT_OPT="-r $SORT_OPT"
            shift 1
            ;;
        -p) pr_dnumform="$2"
            shift 2
            ;;
        -C) CX=0.5
            shift 1
            ;;
        -H) HEAD=1
            shift 1
            ;;
        -V) echo $VERSION
            exit
            ;;
        --) shift 1
            break
            ;;
   esac
done

   # Remove header if sorting requested
[ $SORT != cat ] && { HEAD= ; UN_HEAD= ; MV_HEAD= ; }

### MAIN ###
#
if [ "$QUANTILIZED" = "yes" ]; then
     cat $1 \
     | _accumulate_result_ $ACC_COL \
     | _print_only_quantils_ $QUANT_COL $COUNT_COL
exit 0
fi

cat $1 \
| $AWK -v dx=$DX -v x=$XAX -v y=$YAX -v cx=$CX -v su=$SU -v mv=$MV \
              -v head=$HEAD -v unhead="$UN_HEAD" -v mvhead="$MV_HEAD" \
              -v norm=$NORM -v float=$FLOAT -v pr_dnumform=$pr_dnumform \
'BEGIN \
{
    CONVFMT = "%.17g"
    OFMT    = "%.17g"
        # User specified format for the print out of the result
    if( pr_dnumform ) OUT_form = pr_dnumform;
    else              OUT_form = OFMT;

    if( dx != 0 ) { one_dx=1/dx }
    else          { one_dx=1      }
}
{
    bin=$x
    if( dx != 0 ) {
           # Round up for negative and positive numbers is different
        if( bin >= 0 ) half=+float
        else           half=-float
           # Round up and take the integer part
        bin=sprintf( "%d", (bin*one_dx)+half )
           # Multiplication by 1 is necessary to attain '-0' to be simply '0'
        bin=1*bin
    }
    hits[bin]++
    if( hits[bin] == 1 ) { nrh++; name[nrh]=bin }

       # Mean values or y-sums
    if( mv || su ) { ysum[bin]+=$y ; ysum2[bin]+=($y*$y) ; ysumT+=$y }

       # Check for min and max x-value if normalization chosen:
       # min and max are INTEGERS if dx!=0
    if( norm ) {
        if( ! counts )       { max=min=bin }
        if( bin > max )      { max=bin }
        else if( bin < min ) { min=bin }
    }

    counts++
}
END \
{
    if( !counts ) exit

        # On request normalize only against the number of counts - ignore dx!
    if( norm == 2 ) { one_dx=1 }
    one_sum=one_dx/counts

    if( mv || su ) { one_ysumT=one_dx/ysumT }

       # Calculate the width of the histogram (x-range)
    if( norm == 3 ) {
        if( dx == 0 ) { dxl=1  }
        else          { dxl=dx }
        x0=min*dxl
           # Found out the width of the x-range
        xrange=(max-min)*dxl
        if( xrange == 0 ) xrange=1
        one_xrange=1/xrange
           # Now rescale dxl by the x-data width
        dxl=dxl*one_xrange
           # Rescale also one_sum=1/dx/counts
        one_sum=one_sum*xrange
           # Evaluated the header to print out the rescaled dx
        unhead = sprintf( unhead, dxl );
    }
       # Print data header
    if( head ) {
        if( unhead != "" )
            printf("#%17s   %15s   %17s%24s\n", "X", "COUNTS", unhead, mvhead)
        else
            printf("#%17s   %15s   %24s\n", "X", "COUNTS", mvhead)
    }
       # Main data loop
    while( nrh > 0 ) {
           x_i=bin=name[nrh]
               # Shift x (possible only with numerical data)
           if( cx )      { x_i=sprintf( OUT_form, bin+cx ) }
               # Normalize for dx!=0
           if( dx != 0 ) { x_val=sprintf( OUT_form, x_i*dx ) }
           else          { x_val=x_i }
               # Print x-value and its count
           printf( "%18s   %15d", x_val, hits[bin] )
               # Print normalized x-value as mapped in the [0-1] range
           if( norm == 3 ) {
               printf( "   "OUT_form, (x_val-x0)*one_xrange )
           }
               # Print an additional column with normalized histogram data
           if( norm ) {
               printf( "   "OUT_form, hits[bin]*one_sum )
           }
               # Compute averages/sums and print additional columns
           if( mv || su ) {
               if( mv ) {
                      # Mean value of the y-variable: the used computation
                      # method can introduce large round-off errors
                   mean=ysum[bin]/hits[bin]
                   mean2=mean*mean
                   sigma2=ysum2[bin]/hits[bin]-mean2
                   sigma=sqrt(sigma2)
               } else {
                      # Normalized sum of y-values
                   mean=ysum[bin]
                   sigma=one_ysumT*mean
               }
               printf( "   "OUT_form"   "OUT_form, mean, sigma )
           }
           printf("\n")
           nrh--
    }
}' \
| _sort_data_on_demand_ $SORT_COL \
| _accumulate_result_ $ACC_COL \
| _print_only_quantils_ $QUANT_COL $COUNT_COL

exit 0
