Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- use Getopt::Std;
- #use warnings;
# usage()
#
# Print the command-line help to standard output and abort the program.
# Takes no arguments and never returns: it always dies, so callers can use it
# as the failure branch of an option check.
sub usage
{
    # -f, -p and -t are required (main() opens both files); -n and -b are optional.
    print "\nUsage: $0 -f <file> -p <cols> -t <file> [-n <col>] [-b <bags>]\n"
        . "-f <filename of dataset>\n"
        . "-p <binary prediction column(s) ex. 1 or 2,3 &c.>\n"
        . "-t <filename of data to predict outcomes>\n"
        . "-n <column number with names>\n"
        . "-b <number of bags default 10>\n\n";

    # A trailing newline suppresses the uninformative "at FILE line N" suffix.
    die "$0: missing or invalid arguments\n";
}
# init()
#
# Parse @ARGV with Getopt::Std and publish the settings through package globals:
#   %opts      - raw option values (f, p, n, b, t)
#   @notdata   - 1-based column numbers that are NOT features: every -p column,
#                followed by the -n column when one was given
#   @topredict - 0-based indices of the -p columns
#   $baggins   - number of bags (-b), 10 when -b is absent or 0
#
# Any missing or malformed required option prints a short reason and calls
# usage(), which does not return.
sub init
{
    # Start clean so a second call does not append to the previous result.
    %opts      = ();
    @notdata   = ();
    @topredict = ();

    getopts('f:p:n:b:t:', \%opts) or usage();

    if (!$opts{f}) { print "Need datafile!\n"; usage(); }
    if (!$opts{p}) { print "Need to specify what we are trying to predict!\n"; usage(); }
    # main() opens this file unconditionally, so reject its absence up front.
    if (!$opts{t}) { print "Need a file of data to predict!\n"; usage(); }

    my @columns = split(/,/, $opts{p});
    foreach my $column (@columns)
    {
        # Columns are 1-based; anything else would silently index the wrong column.
        if ($column !~ /^\s*[1-9][0-9]*\s*$/)
        {
            print "Prediction columns must be positive integers!\n";
            usage();
        }
    }

    @notdata   = @columns;
    @topredict = map { $_ - 1 } @columns;

    $baggins = $opts{b} ? $opts{b} : 10;

    push(@notdata, $opts{n}) if $opts{n};
}
# bagging($numbags, @bagdata)
#
# Build bootstrap samples ("bags") of column-oriented data.
#   $numbags - how many bags to generate
#   @bagdata - list of array references, one per column
#
# Fills the package global @bags so that $bags[$bag][$column] is an array
# reference holding as many values as the source column, each drawn at random
# (with replacement) from that same column. Columns are sampled independently
# of each other. Any previous content of @bags is discarded. Returns nothing.
sub bagging
{
    my ($numbags, @bagdata) = @_;

    @bags = ();

    foreach my $bag (0 .. $numbags - 1)
    {
        # The column count comes from the argument list itself.
        foreach my $column (0 .. $#bagdata)
        {
            my $source = $bagdata[$column] || [];
            my $rows   = scalar @{$source};

            $bags[$bag][$column] =
                [ map { $source->[ int(rand($rows)) ] } 1 .. $rows ];
        }
    }

    return;
}
# parsefile(@lines)
#
# Turn the lines of a whitespace-delimited table into column-oriented data.
#
# The first non-blank line is the header: its fields are written into the
# package global @datfeat, starting at index 0. Existing entries beyond the
# header's width are left in place.
# Every later non-blank line is one data row.
#
# Returns a list of array references indexed [column][row], i.e. the
# transpose of the file layout.
sub parsefile
{
    my (@filer) = @_;
    my @datass      = ();
    my $row         = 0;
    my $seen_header = 0;

    foreach my $line (@filer)
    {
        # split ' ' (awk mode) ignores leading whitespace, so an indented
        # line does not produce an empty first field that shifts every column.
        my @fields = split(' ', $line);

        # Blank lines carry no data; skipping them keeps row indices dense.
        next unless @fields;

        if (!$seen_header)
        {
            my $column = 0;
            foreach my $name (@fields)
            {
                $datfeat[$column++] = $name;
            }
            $seen_header = 1;
            next;
        }

        my $column = 0;
        foreach my $value (@fields)
        {
            $datass[$column++][$row] = $value;
        }
        $row++;
    }

    return @datass;
}
# main()
#
# Program entry point:
#   1. parse the command line (init),
#   2. read and parse the training file (-f) and the file to predict (-t),
#   3. derive @realdata, the 0-based indices of the header columns that are
#      neither a prediction column nor the name column,
#   4. fill %namelookup{predictor name}{row name} with the value each row of
#      the prediction file has in that predictor column (the row name is
#      taken from column 0 of the prediction file),
#   5. build the probability tables and score the prediction file.
#
# Results are shared with the other subs through the package globals
# @testdata, @realdata and %namelookup.
sub main
{
    init();

    open(my $train_fh, "<", $opts{f})
        or die "Cannot open dataset '$opts{f}': $!\n";
    my @file = <$train_fh>;
    close($train_fh);

    open(my $test_fh, "<", $opts{t})
        or die "Cannot open prediction file '$opts{t}': $!\n";
    my @predfile = <$test_fh>;
    close($test_fh);

    # Order matters: parsefile() writes the header into @datfeat, so after
    # these two calls @datfeat holds the header of the prediction file.
    my @datas = parsefile(@file);
    @testdata = parsefile(@predfile);

    # @notdata holds 1-based column numbers; @realdata is 0-based.
    my %excluded = map { ($_ - 1) => 1 } @notdata;
    @realdata = grep { !$excluded{$_} } 0 .. $#datfeat;

    # Walk every row of the prediction file (not of the training file) so
    # each test row gets its known outcome recorded.
    my $names = $testdata[0] || [];
    foreach my $predicting (@topredict)
    {
        foreach my $row (0 .. $#{$names})
        {
            $namelookup{ $datfeat[$predicting] }{ $names->[$row] } =
                $testdata[$predicting][$row];
        }
    }

    # Bagging is currently disabled:
    #bagging($baggins, @datas);
    %empty = maketreesandstuff(@datas);
    predict(%empty);
}
# maketreesandstuff(@isdata)
#
# Estimate, by random sampling, how often each feature value co-occurs with
# outcome 1 and with outcome 0 of every predictor column.
#
#   @isdata - column-oriented data, $isdata[column][row]
#
# Reads the package globals @realdata (feature column indices), @topredict
# (predictor column indices) and @datfeat (column names).
#
# For every (feature, predictor) pair it draws 100000 rows at random with
# replacement. Outcomes are compared numerically (== 1 / == 0); rows whose
# outcome is neither are not tallied.
#
# Returns a hash:
#   {predictor}{feature}{value}{truthiero}   share of tallied draws with outcome 1
#   {predictor}{feature}{value}{truthierz}   share of tallied draws with outcome 0
#   {predictor}{feature}{value}{mutual}{other feature}{other value}{truthiero|truthierz}
#                                            the same shares, restricted to draws
#                                            where the other feature had that value
# A share whose count is zero is replaced by a small positive floor.
#
# Side effects: adds value counts to the package global %countdatacula and
# prints one "Truth:" line per (predictor, feature, value) to standard output.
sub maketreesandstuff
{
    my (@isdata) = @_;
    my %predsandprobs = ();
    my %tablelookup   = ();
    my $sims          = 100000;

    # Count how often each value occurs in each feature column.
    foreach my $foreals (@realdata)
    {
        foreach my $datum (@{ $isdata[$foreals] || [] })
        {
            $countdatacula{ $datfeat[$foreals] }{$datum}++;
        }
    }

    # Tally outcomes over randomly drawn rows.
    foreach my $foreals (@realdata)
    {
        my $feature = $datfeat[$foreals];
        # Row count of this feature's own column.
        my $rows = scalar @{ $isdata[$foreals] || [] };

        foreach my $predicting (@topredict)
        {
            my $target = $datfeat[$predicting];

            for (my $sim = 0; $sim < $sims; $sim++)
            {
                my $rando = int(rand($rows));
                my $yes   = int(rand(2));
                my $label = $isdata[$predicting][$rando];
                my $value = $isdata[$foreals][$rando];

                # Coin-flip agreement count; tallied but not part of the result.
                if ($label == $yes)
                {
                    $predsandprobs{$target}{$feature}{$value}{guess}++;
                }

                my $outcome;
                if    ($label == 1) { $outcome = 'truthone'; }
                elsif ($label == 0) { $outcome = 'truthzero'; }
                next unless defined $outcome;

                $predsandprobs{$target}{$feature}{$value}{$outcome}++;

                # Joint tallies with the value every other feature has in this row.
                foreach my $realz (@realdata)
                {
                    next if $realz == $foreals;
                    $predsandprobs{$target}{$feature}{$value}{mutual}
                        { $datfeat[$realz] }{ $isdata[$realz][$rando] }{$outcome}++;
                }
            }
        }
    }

    # Turn tallies into shares.
    # key1 = predictor, key2 = feature, key3 = feature value
    foreach my $key1 (keys %predsandprobs)
    {
        foreach my $key2 (keys %{ $predsandprobs{$key1} })
        {
            foreach my $key3 (keys %{ $predsandprobs{$key1}{$key2} })
            {
                my $cell  = $predsandprobs{$key1}{$key2}{$key3};
                my $zero  = $cell->{truthzero};
                my $one   = $cell->{truthone};

                my $truthierz = $zero ? $zero / ($zero + $one) : 0.00000001;
                my $truthiero = $one  ? $one  / ($zero + $one) : 0.00000001;

                $tablelookup{$key1}{$key2}{$key3}{truthierz} = $truthierz;
                $tablelookup{$key1}{$key2}{$key3}{truthiero} = $truthiero;

                # key4 = other feature, key5 = value of that other feature
                foreach my $key4 (keys %{ $cell->{mutual} || {} })
                {
                    foreach my $key5 (keys %{ $cell->{mutual}{$key4} })
                    {
                        my $mzero = $cell->{mutual}{$key4}{$key5}{truthzero};
                        my $mone  = $cell->{mutual}{$key4}{$key5}{truthone};

                        # Both shares use the same denominator: the joint
                        # zero count plus the joint one count.
                        # NOTE(review): the floors here (1e-6 / 1e-7) differ from
                        # the 1e-8 used above; kept as found - confirm intent.
                        my $truthiermo = $mone  ? $mone  / ($mzero + $mone) : 0.000001;
                        my $truthiermz = $mzero ? $mzero / ($mzero + $mone) : 0.0000001;

                        $tablelookup{$key1}{$key2}{$key3}{mutual}{$key4}{$key5}{truthierz} = $truthiermz;
                        $tablelookup{$key1}{$key2}{$key3}{mutual}{$key4}{$key5}{truthiero} = $truthiermo;
                    }
                }

                print "Truth: $key1 $key2 $key3 : $truthierz : $truthiero :0 $cell->{truthzero} : 1 $cell->{truthone}\n";
            }
        }
    }

    return %tablelookup;
}
# predict(%whatweneed)
#
# Score every row of the prediction data against the probability table and
# print the hit rates.
#
#   %whatweneed - table as returned by maketreesandstuff():
#                 {predictor}{feature}{value}{truthiero|truthierz}
#
# Reads the package globals @testdata ($testdata[column][row], column 0 being
# the row name), @realdata (feature column indices), @datfeat (column names)
# and %namelookup ({predictor}{row name} = known outcome).
#
# For each row and predictor the truthiero and truthierz values of the row's
# feature values are collected and ranked, each list independently, from
# highest to lowest. The prediction is then derived from the top three
# entries of both rankings (see the decision block below) and compared with
# the known outcome using ==.
#
# Accumulates into the package globals %matches, %matchestotal and
# %matchesnum (they are not reset here) and prints one summary block per
# predictor to standard output. A rate whose denominator is zero is printed
# as "n/a".
sub predict
{
    my (%whatweneed) = @_;
    my @doit  = @testdata;
    my $names = $doit[0] || [];

    foreach my $y (0 .. $#{$names})
    {
        my $actual_of = $names->[$y];

        foreach my $key1 (keys %whatweneed)
        {
            # %slot{index}{valone|namevalone|nameone|val0|nameval0|name0}
            my %slot  = ();
            my $onsie = 0;

            if ($key1)
            {
                # One slot per feature column of this row.
                foreach my $x (@realdata)
                {
                    $slot{$onsie}{valone}     = $whatweneed{$key1}{ $datfeat[$x] }{ $doit[$x][$y] }{truthiero};
                    $slot{$onsie}{namevalone} = $doit[$x][$y];
                    $slot{$onsie}{nameone}    = $datfeat[$x];

                    $slot{$onsie}{val0}       = $whatweneed{$key1}{ $datfeat[$x] }{ $doit[$x][$y] }{truthierz};
                    $slot{$onsie}{nameval0}   = $doit[$x][$y];
                    $slot{$onsie}{name0}      = $datfeat[$x];

                    $onsie++;
                }
            }

            # Rank the "one" triple and the "zero" triple independently,
            # highest score first, so slot 0 holds the best of each.
            foreach my $fields ([qw(valone namevalone nameone)], [qw(val0 nameval0 name0)])
            {
                my $score  = $fields->[0];
                my @ranked = sort { ($slot{$b}{$score} || 0) <=> ($slot{$a}{$score} || 0) }
                             0 .. $onsie - 1;
                # Copy first, then write back, so no value is overwritten
                # before it has been moved.
                my @moved  = map { [ @{ $slot{$_} }{ @{$fields} } ] } @ranked;
                foreach my $rank (0 .. $#moved)
                {
                    @{ $slot{$rank} }{ @{$fields} } = @{ $moved[$rank] };
                }
            }

            my $probone = $slot{0}{valone};
            my $prob0   = $slot{0}{val0};
            $matchestotal{$key1}++;

            # Decision: the class with the better top score wins if its
            # second-best score also beats the other class's best; failing
            # that, if its third-best beats the other class's third-best;
            # otherwise the other class is predicted.
            my $predicted;
            if ($probone >= $prob0)
            {
                if    ($slot{1}{valone} >= $slot{0}{val0}) { $predicted = 1; }
                elsif ($slot{2}{valone} >= $slot{2}{val0}) { $predicted = 1; }
                else                                       { $predicted = 0; }
            }
            elsif ($prob0 > $probone)
            {
                if    ($slot{1}{val0} >= $slot{0}{valone}) { $predicted = 0; }
                elsif ($slot{2}{val0} >= $slot{2}{valone}) { $predicted = 0; }
                else                                       { $predicted = 1; }
            }

            my $actual = $namelookup{$key1}{$actual_of};

            if (defined $predicted && $actual == $predicted)
            {
                $matches{$key1}++;
                $matchesnum{$key1}{ $predicted ? 'one' : 'zero' }++;
            }

            if    ($actual == 1) { $matchesnum{$key1}{onetotal}++; }
            elsif ($actual == 0) { $matchesnum{$key1}{zerototal}++; }
        }
    }

    # Report every predictor that was evaluated, including those with no hit.
    foreach my $key5 (sort keys %matchestotal)
    {
        $matchesnum{$key5}{zero} ||= 0;
        $matchesnum{$key5}{one}  ||= 0;

        # A class may be absent from the prediction data; avoid dividing by zero.
        my $valzero = $matchesnum{$key5}{zerototal}
                    ? $matchesnum{$key5}{zero} / $matchesnum{$key5}{zerototal}
                    : 'n/a';
        my $valone  = $matchesnum{$key5}{onetotal}
                    ? $matchesnum{$key5}{one} / $matchesnum{$key5}{onetotal}
                    : 'n/a';
        my $valtotal = $matchestotal{$key5}
                     ? ($matches{$key5} || 0) / $matchestotal{$key5}
                     : 'n/a';

        print "---$key5--- \nOne: $valone\nZero: $valzero\nTotal: $valtotal\n";
    }
}
# Run the program only when this file is executed as a script; when it is
# loaded with require/do (e.g. from a test) the subs are defined but nothing runs.
main() unless caller;
Add Comment
Please, Sign In to add comment