#!/usr/bin/perl -w 


system("ls ../jan2722/3_output_bin30/ > eucliddist");
open(IF,"eucliddist");
while($a=<IF>){
chomp($a);
open(IF2,"../jan2722/3_output_bin30/$a");
$head=<IF2>;
#02_dist_wtpb_y2h2pb_signedbin30.txt
@d = split(/_/,$a);
$key = "$d[2] $d[3]";
$key2 = "$d[3] $d[2]";
while($b=<IF2>){
chomp($b);
($gene,$euclid) = split(/\t/,$b);
$gene .= "_splicingValues.txt";
$euclidhash{$key}{$gene} = $euclid;
$euclidhash{$key2}{$gene} = $euclid;
#print "
#:$key:
#:$key2:
#$euclidhash{$key}{$gene} = $euclid;
#$euclidhash{$key2}{$gene} = $euclid;
#";
#gene	euc_dist
#YAL003W	0.0973344992455037
}
close(IF2);
}
#close(IF);

#exit;







$binsize=20;
#$step=30;
$posmin=-100;
$posmax=400;

$nbins = int($posmax/$binsize);

$pre = "/disk_aug18_a/jan2019/processed_data100/smit_files";

@dirs = qw(
hs1A_50
hs1B_51
hs1C_52
hs1D_53
);

@info = qw(wt wtpb y2h2 y2h2pb);

open(PROB1,">shortlist");

foreach $d (@dirs){
system("ls $pre/$d/*splicingValues.txt > tlist");
open(IF,"tlist");
while($a=<IF>){
chomp($a);
@cc = split(/\//,$a);
$gene = $cc[6];
$ghash{$gene}++;
#print "filling $d\n$gene\n";
$tposmax = 0;
open(IF2,$a);
$aa = <IF2>;
while($aa=<IF2>){
chomp($aa);
@bb = split(/\t/,$aa);
$pos = $bb[0]; $val = $bb[1]; $splraw = $bb[2] ; $splnorm = $bb[3];
#relPos	readSum	splicingValueRaw	splicingValueNorm
#-100	0	NA	0
#-99	0	NA	0
$splnorm =~ s/NA/0/g;
if($pos>=$posmin && $pos<=$posmax){
$tposmax = $pos;
$poshash{$d}{$gene}  .= "$pos,";
$valhash{$d}{$gene}  .= "$splnorm,";
#print "$d\n";print "$a\n";exit;
}
}
close(IF2);
if($tposmax<$posmax){print PROB1 "$d $gene $tposmax\n"; $gooddata{$d}{$gene}=0; }
else{$gooddata{$d}{$gene} = 1;}

}
close(IF);
}

system("rm pngs/*");

open(HF,">wilcoxonallpval.html");
$header = "<pre>
<table border=1>
<tr><td> comparison gene<td>info<td>Euclidean dist.<td>~area under curve<td>p-value<td> \n";
print HF $header ;

foreach $gene (sort keys %ghash){
for($i=0;$i<4;$i++){
$da = $dirs[$i];
$infoa = $info[$i];
$poshash{$da}{$gene} =~ s/,$//;
$valhash{$da}{$gene} =~ s/,$//;
for($j=$i+1;$j<4;$j++){
$db = $dirs[$j];
$infob = $info[$j];

$poshash{$db}{$gene} =~ s/,$//;
$valhash{$db}{$gene} =~ s/,$//;

if($gooddata{$da}{$gene} == 1 && $gooddata{$db}{$gene} == 1){

$euclid = 0;
@binneda = ();
@binnedb = ();
@alla = split(/,/,$valhash{$da}{$gene});
@allb = split(/,/,$valhash{$db}{$gene});
#print "@alla\n";
for($p=0;$p<$posmax;$p++){
$bb = int($p/$binsize);
#print "$bb\n";
$binneda[$bb] += $alla[$p]/$binsize;
$binnedb[$bb] += $allb[$p]/$binsize;
}
#exit;


#normalizing max to 1
$max=0;
for($p=0;$p<$nbins;$p++){
if($binneda[$p]>$max){$max = $binneda[$p];}
if($binnedb[$p]>$max){$max = $binnedb[$p];}
}

if($max>0){
for($p=0;$p<$nbins;$p++){
$binneda[$p] = $binneda[$p]/$max;
$binnedb[$p] = $binnedb[$p]/$max;
}
}






$bintexta = "";
$bintextb = "";
$postext = "";
$firstpos{$da}{$gene} = -1000;
$firstpos{$db}{$gene} = -1000;
$maxpos{$da}{$gene} = 0;
$max{$da}{$gene} = 0;
$maxpos{$db}{$gene} = 0;
$max{$db}{$gene} = 0;
$teuclid=0;
$nup=0;
for($p=0;$p<$nbins;$p++){
$bintexta .= "$binneda[$p],";
$bintextb .= "$binnedb[$p],";
#$teuclid += ($binneda[$p] - $binnedb[$p])**2;
$teuclid += $binneda[$p] - $binnedb[$p];
if($binneda[$p]>0 && $binnedb[$p]>0){
if($binneda[$p]>$binnedb[$p]){$nup++;}
else{$nup--;}
}
$tmpp = $binsize*$p - 100;
$postext .= "$tmpp,";

if($tmpp>0 && $binneda[$p]>0 && $firstpos{$da}{$gene} == -1000){$firstpos{$da}{$gene} = $tmpp;}
if($tmpp>0 && $binnedb[$p]>0 && $firstpos{$db}{$gene} == -1000){$firstpos{$db}{$gene} = $tmpp;}

if($tmpp>0 && $binneda[$p]>$max{$da}{$gene}){$max{$da}{$gene}=$binneda[$p];$maxpos{$da}{$gene}=$tmpp;}
if($tmpp>0 && $binnedb[$p]>$max{$db}{$gene}){$max{$db}{$gene}=$binnedb[$p];$maxpos{$db}{$gene}=$tmpp;}

}
#$teuclid = sqrt($teuclid);
#if($nup<0){$teuclid = -1 * $teuclid;}
#find position of half max

$halfmaxpos{$da}{$gene} = -1000;
$halfmaxpos{$db}{$gene} = -1000;
for($p=0;$p<$nbins;$p++){
$tmpp = $binsize*$p - 100;

if($tmpp>0 && $binneda[$p]>(($max{$da}{$gene})/2)  && $halfmaxpos{$da}{$gene} == -1000){$halfmaxpos{$da}{$gene} = $tmpp;}
if($tmpp>0 && $binnedb[$p]>(($max{$db}{$gene})/2)  && $halfmaxpos{$db}{$gene} == -1000){$halfmaxpos{$db}{$gene} = $tmpp;}

}

$postext =~ s/,$//;
$bintexta =~ s/,$//;
$bintextb =~ s/,$//;


open(RF,">tmp.R");
print RF "

dfa <- data.frame(x=c($postext) , y=c($bintexta) )
dfb <- data.frame(x=c($postext) , y=c($bintextb) )

png(\"pngs/$da.$gene.png\")
plot(dfa\$x, dfa\$y, type='l', col='red' , ylim=c(0,1) )

png(\"pngs/$db.$gene.png\")
plot(dfb\$x, dfb\$y, type='l', col='red' , ylim=c(0,1) )

png(\"pngs/$da.$db.$gene.png\")
plot(dfa\$x, dfa\$y, type='l', col='red' , ylim=c(0,1) )
lines(dfb\$x, dfb\$y, type='l', col='blue')
legend('topleft',inset=0.05,c(\"$da\",\"$db\"),lty=1,col=c(\"red\",\"blue\"))

wilcox.test(dfa\$y, dfb\$y , exact=FALSE, paired=TRUE )
wilcox.test(dfa\$y, dfb\$y , exact=FALSE, paired=TRUE)\$p.value

";
system("R --vanilla < tmp.R > tmp.out");

open(TO,"tmp.out");
while($gg=<TO>){
if($gg =~ /p-value/){
chomp($gg);
@hh = split(/ /,$gg);
$pvalue = $hh[5];
}
}
close(TO);
$key = "$infoa $infob";
#print "$key $gene\n";
#intronless genes ?
if(exists $euclidhash{$key}{$gene}){

if($pvalue eq 'NA'){$euclidhash{$key}{$gene} = "";}
else{
$line = "<tr><td>$da.$db.$gene<td>$key<td>$euclidhash{$key}{$gene}<td>$teuclid<td>$pvalue<td><img height=240 src=\"pngs/$da.$db.$gene.png\"> \n";
#$line = "<tr><td>$da.$db.$gene<td>$key<td>$pvalue<td><img height=240 src=\"pngs/$da.$db.$gene.png\"> \n";
$bighash{"$infoa $infob"} .= $line;
print HF $line; 
}

}
}
}
}


}
close(HF);


for($i=0;$i<4;$i++){
$infoa = $info[$i];
for($j=$i+1;$j<4;$j++){
$infob = $info[$j];
open(OF,">weuclid.$infoa.$infob.html");
print OF $header;
print OF $bighash{"$infoa $infob"};
close OF;
}}

exit;

open(SFM,">singlefirsthalfmax.html");
open(SFMTSV,">singlefirsthalfmax.tsv");
print SFM "
<pre>
<table border=1>
<tr><td>gene \n";
print SFMTSV "gene";
for($i=0;$i<4;$i++){
$da = $dirs[$i];
$infoa = $info[$i];
print SFM "<td> $da $infoa<td>first_pos,halfmax_pos,max_pos,maxvalue";
print SFMTSV "\t$da\_$infoa\_first_pos\t$da\_$infoa\_halfmax_pos\t$da\_$infoa\_max_pos\t$da\_$infoa\_maxvalue";}

print SFM "\n";
print SFMTSV "\n";

foreach $gene (sort keys %ghash){

$line = "<tr><td>$gene";
$tsvline = "$gene";
for($i=0;$i<4;$i++){
$da = $dirs[$i];
$infoa = $info[$i];
if(exists $firstpos{$da}{$gene} && $firstpos{$da}{$gene}>0 ){
$line .= "<td><img height=240 src=\"pngs/$da.$gene.png\"><td>$firstpos{$da}{$gene},$halfmaxpos{$da}{$gene},$maxpos{$da}{$gene},$max{$da}{$gene} ";
$tsvline .= "\t$firstpos{$da}{$gene}\t$halfmaxpos{$da}{$gene}\t$maxpos{$da}{$gene}\t$max{$da}{$gene} ";
}
else{$line .= "<td><td>";
$tsvline .= "\t\t\t\t";
}
}

if($line =~ /img/){
print SFM $line;
print SFM "\n";
print SFMTSV $tsvline;
print SFMTSV "\n";
}


}

