#!/usr/bin/perl -w 
open(IF,"sgdlookup");

##sacCer3.sgdGene.name	sacCer3.sgdDescription.name	sacCer3.sgdToName.name	sacCer3.sgdToName.value	uniProt.description.acc
#YAL012W	YAL012W	YAL012W	CYS3	P31373
$a=<IF>;
while($a=<IF>){
@b = split(/\t/,$a);
$gene = $b[0];
$nice{$gene} = $b[3];
}


system("ls ../jan2722/3_output_bin30/ > eucliddist");
open(IF,"eucliddist");
while($a=<IF>){
chomp($a);
open(IF2,"../jan2722/3_output_bin30/$a");
$head=<IF2>;
#02_dist_wtpb_y2h2pb_signedbin30.txt
@d = split(/_/,$a);
$key = "$d[2] $d[3]";
$key2 = "$d[3] $d[2]";
while($b=<IF2>){
chomp($b);
($gene,$euclid) = split(/\t/,$b);
#$gene .= "_splicingValues.txt";
$euclidhash{$key}{$gene} = $euclid;
$euclidhash{$key2}{$gene} = $euclid;
#print "
#:$key:
#:$key2:
#$euclidhash{$key}{$gene} = $euclid;
#$euclidhash{$key2}{$gene} = $euclid;
#";
#gene	euc_dist
#YAL003W	0.0973344992455037
}
close(IF2);
}
close(IF);


#gene	firstpos.hs1A_50	maxpos.hs1A_50	maxvalue.hs1A_50	halfmaxpos.hs1A_50	firstpos.hs1B_51	maxpos.hs1B_51	maxvalue.hs1B_51	halfmaxpos.hs1B_51	firstpos.hs1C_52	maxpos.hs1C_52	maxvalue.hs1C_52	halfmaxpos.hs1C_52	firstpos.hs1D_53	maxpos.hs1D_53	maxvalue.hs1D_53	halfmaxpos.hs1D_53
#YAL003W	1	59	1	33	1	62	1	9	3	64	1	13	1	64	1	13
#YGR183C	277	285	1	60	277	288	1	276	279	288	1	279	286	783	1	63
open(IF,"firstmaxhalf.txt");
$head=<IF>;
while($a=<IF>){
chomp($a);
@b = split(/\t/,$a);

$gene = $b[0];
$starts[0] = $b[1];
$starts[1] = $b[5];
$starts[2] = $b[9];
$starts[3] = $b[13];
@sorted = sort(@starts);
print "$gene $sorted[0] $sorted[3]\n";
$start{$gene} = $sorted[0];
}
close(IF);






$binsize=20;
#$step=30;
$posmin=-100;


$pre = "/disk_aug18_a/jan2019/processed_data100/smit_files";

@dirs = qw(
hs1A_50
hs1B_51
hs1C_52
hs1D_53
);

#references
#hs1A_50
#hs1C_52

@info = qw(wt wtpb y2h2 y2h2pb);

open(PROB1,">shortlist");

foreach $d (@dirs){
system("ls $pre/$d/*splicingValues.txt > tlist");
open(IF,"tlist");
while($a=<IF>){
chomp($a);
@cc = split(/\//,$a);
$gene = $cc[6];
$gene =~ s/_splicingValues.txt//;
$ghash{$gene}++;
if(exists $start{$gene}){
$thismax = int(($start{$gene} + 200 + 20)/20) * 20 ;
$nbins = int((100+$thismax)/$binsize);
#print "filling $d\n$gene\n";
$tposmax = 0;
open(IF2,$a);
$aa = <IF2>;
while($aa=<IF2>){
chomp($aa);
@bb = split(/\t/,$aa);
$pos = $bb[0]; $val = $bb[1]; $splraw = $bb[2] ; $splnorm = $bb[3];
#relPos	readSum	splicingValueRaw	splicingValueNorm
#-100	0	NA	0
#-99	0	NA	0
$splnorm =~ s/NA/0/g;
if($pos>=$posmin && $pos<=$thismax){
$tposmax = $pos;
$poshash{$d}{$gene}  .= "$pos,";
$valhash{$d}{$gene}  .= "$splnorm,";
#print "$d\n";print "$a\n";
}
}
close(IF2);
if($tposmax<$thismax){print PROB1 "$d $gene $tposmax\n"; $gooddata{$d}{$gene}=0; }
else{$gooddata{$d}{$gene} = 1;}
}
}
close(IF);
}

system("rm pngs/*");

open(HF,">wilcoxonallpval.html");
$header = "<pre>
<table border=1>
<tr><td> comparison gene<td>alt_name<td>info<td>~area under curve<td>p-value<td> \n";
print HF $header ;

foreach $gene (sort keys %ghash){
if(exists $start{$gene}){
$thismax = int(($start{$gene} + 200 + 20)/20) * 20 ;
print "$gene $thismax\n";
for($i=0;$i<4;$i++){
$da = $dirs[$i];
$infoa = $info[$i];
$poshash{$da}{$gene} =~ s/,$//;
$valhash{$da}{$gene} =~ s/,$//;
for($j=$i+1;$j<4;$j++){
$db = $dirs[$j];
$infob = $info[$j];

$poshash{$db}{$gene} =~ s/,$//;
$valhash{$db}{$gene} =~ s/,$//;

if($gooddata{$da}{$gene} == 1 && $gooddata{$db}{$gene} == 1){

$euclid = 0;
@binneda = ();
@binnedb = ();
@alla = split(/,/,$valhash{$da}{$gene});
@allb = split(/,/,$valhash{$db}{$gene});
#print "@alla\n";
for($p=0;$p<(100+$thismax);$p++){
$bb = int($p/$binsize);
#print "$bb\n";
$binneda[$bb] += $alla[$p]/$binsize;
$binnedb[$bb] += $allb[$p]/$binsize;
}


$nbins = int((100+$thismax)/$binsize);

$bintexta = "";
$bintextb = "";
$postext = "";
$firstpos{$da}{$gene} = -1000;
$firstpos{$db}{$gene} = -1000;
$maxpos{$da}{$gene} = 0;
$max{$da}{$gene} = 0;
$maxpos{$db}{$gene} = 0;
$max{$db}{$gene} = 0;
$tscore = 0;
$tscorea = 0;
$tscoreb = 0;
$tfactor = 0;
$nup=0;
for($p=0;$p<$nbins;$p++){
$bintexta .= "$binneda[$p],";
$bintextb .= "$binnedb[$p],";
#$tscore += ($binneda[$p] - $binnedb[$p])**2;
$tscore += $binneda[$p] - $binnedb[$p];
$tscorea += $binneda[$p] ;
$tscoreb += $binnedb[$p];

if($da =~ /hs1A_50/ || $da =~ /hs1C_52/){$tfactor += $binneda[$p];}
else{$tfactor += $binnedb[$p];}

if($binneda[$p]>0 && $binnedb[$p]>0){
if($binneda[$p]>$binnedb[$p]){$nup++;}
else{$nup--;}
}
$tmpp = $binsize*$p - 100;
$postext .= "$tmpp,";

if($tmpp>0 && $binneda[$p]>0 && $firstpos{$da}{$gene} == -1000){$firstpos{$da}{$gene} = $tmpp;}
if($tmpp>0 && $binnedb[$p]>0 && $firstpos{$db}{$gene} == -1000){$firstpos{$db}{$gene} = $tmpp;}

if($tmpp>0 && $binneda[$p]>$max{$da}{$gene}){$max{$da}{$gene}=$binneda[$p];$maxpos{$da}{$gene}=$tmpp;}
if($tmpp>0 && $binnedb[$p]>$max{$db}{$gene}){$max{$db}{$gene}=$binnedb[$p];$maxpos{$db}{$gene}=$tmpp;}

}
#$tscore = sqrt($tscore);
#if($nup<0){$tscore = -1 * $tscore;}
#find position of half max

$halfmaxpos{$da}{$gene} = -1000;
$halfmaxpos{$db}{$gene} = -1000;
for($p=0;$p<$nbins;$p++){
$tmpp = $binsize*$p - 100;

if($tmpp>0 && $binneda[$p]>(($max{$da}{$gene})/2)  && $halfmaxpos{$da}{$gene} == -1000){$halfmaxpos{$da}{$gene} = $tmpp;}
if($tmpp>0 && $binnedb[$p]>(($max{$db}{$gene})/2)  && $halfmaxpos{$db}{$gene} == -1000){$halfmaxpos{$db}{$gene} = $tmpp;}

}

$postext =~ s/,$//;
$bintexta =~ s/,$//;
$bintextb =~ s/,$//;


open(RF,">tmp.R");
print RF "

dfa <- data.frame(x=c($postext) , y=c($bintexta) )
dfb <- data.frame(x=c($postext) , y=c($bintextb) )

png(\"pngs/$da.$gene.png\")
plot(dfa\$x, dfa\$y, type='l', col='red' , ylim=c(0,1) )

png(\"pngs/$db.$gene.png\")
plot(dfb\$x, dfb\$y, type='l', col='red' , ylim=c(0,1) )

png(\"pngs/$da.$db.$gene.png\")
plot(dfa\$x, dfa\$y, type='l', col='red' , ylim=c(0,1) )
lines(dfb\$x, dfb\$y, type='l', col='blue')
legend('topleft',inset=0.05,c(\"$da\",\"$db\"),lty=1,col=c(\"red\",\"blue\"))

wilcox.test(dfa\$y, dfb\$y , exact=FALSE, paired=TRUE )
wilcox.test(dfa\$y, dfb\$y , exact=FALSE, paired=TRUE)\$p.value

";
system("R --vanilla < tmp.R > tmp.out");

open(TO,"tmp.out");
while($gg=<TO>){
if($gg =~ /p-value/){
chomp($gg);
@hh = split(/ /,$gg);
$pvalue = $hh[5];
}
}
close(TO);
$key = "$infoa $infob";
#print "$key $gene\n";
#intronless genes ?
if(exists $euclidhash{$key}{$gene}){

if($pvalue eq 'NA'){$euclidhash{$key}{$gene} = "";}
else{
$tscore = $tscore/$tfactor;
$line = "<tr><td>$da.$db.$gene<td>$nice{$gene}<td>$key<td>$tscore<td>$pvalue<td><img height=240 src=\"pngs/$da.$db.$gene.png\"> \n";
#$line = "<tr><td>$da.$db.$gene<td>$key<td>$pvalue<td><img height=240 src=\"pngs/$da.$db.$gene.png\"> \n";
$bighash{"$infoa $infob"} .= $line;
print HF $line; 
}

}
}
}
#exit;
}

}
}
close(HF);

#exit;

for($i=0;$i<4;$i++){
$infoa = $info[$i];
for($j=$i+1;$j<4;$j++){
$infob = $info[$j];
open(OF,">weuclid.$infoa.$infob.html");
print OF $header;
print OF $bighash{"$infoa $infob"};
close OF;
}}


open(SFM,">singlefirsthalfmax.html");
open(SFMTSV,">singlefirsthalfmax.tsv");
print SFM "
<pre>
<table border=1>
<tr><td>gene<td>alt_name \n";
print SFMTSV "gene\talt_name";
for($i=0;$i<4;$i++){
$da = $dirs[$i];
$infoa = $info[$i];
print SFM "<td> $da $infoa<td>first_pos,halfmax_pos,max_pos,maxvalue";
print SFMTSV "\t$da\_$infoa\_first_pos\t$da\_$infoa\_halfmax_pos\t$da\_$infoa\_max_pos\t$da\_$infoa\_maxvalue";}

print SFM "\n";
print SFMTSV "\n";

foreach $gene (sort keys %ghash){

$line = "<tr><td>$gene<td>$nice{$gene}";
$tsvline = "$gene";
for($i=0;$i<4;$i++){
$da = $dirs[$i];
$infoa = $info[$i];
if(exists $firstpos{$da}{$gene} && $firstpos{$da}{$gene}>0 ){
$line .= "<td><img height=240 src=\"pngs/$da.$gene.png\"><td>$firstpos{$da}{$gene},$halfmaxpos{$da}{$gene},$maxpos{$da}{$gene},$max{$da}{$gene} ";
$tsvline .= "\t$firstpos{$da}{$gene}\t$halfmaxpos{$da}{$gene}\t$maxpos{$da}{$gene}\t$max{$da}{$gene} ";
}
else{$line .= "<td><td>";
$tsvline .= "\t\t\t\t";
}
}

if($line =~ /img/){
print SFM $line;
print SFM "\n";
print SFMTSV $tsvline;
print SFMTSV "\n";
}


}

