Google Love.pl

Discussion in 'Gaming' started by Tool, Mar 14, 2006.

  1. Tool

    Tool Active Member

    Age:
    34
    Posts:
    31
    Likes Received:
    0
    Joined:
    Mar 9, 2006
    This script will grind through your web site's "access.log" file (which must be in the "combined" log format). It'll pick out the top 100 Google searches found in the referer field, re-run those searches, and determine which ones are giving your website all the linky Google love -- in other words, the searches that your site 'wins' on.

    I'm sure some of you know how to work with Perl. If not, then... well, I can't be bothered writing a tutorial. Google it or something.

    Code:
    #!/usr/bin/perl 
      
     # Positional command-line arguments: first the site host that is matched
     # against Google result URLs, then the Google API key that is sent with
     # every search request.  Shifting them off @ARGV leaves the access log to
     # be read from STDIN (or from any remaining file arguments) by <>.
     my $site = shift @ARGV; 
     my $google_api_key = shift @ARGV; 
      
     # A missing (or false) API key means the script was not given both
     # arguments: abort with the usage text on STDERR.
     if (!$google_api_key) { 
       die q{ 
     goog-love.pl - find out where your site's google juice comes from 
      
     This script will grind through your web site's "access.log" file (which must be 
     in the "combined" log format).   It'll pick out the top 100 Google searches 
     found in the referer field, re-run those searches, and determine which ones are 
     giving your website all the linky Google love -- in other words, the searches 
     that your site 'wins' on. 
      
     The output is in plain text and a chunk of HTML. 
      
     usage: 
      
       goog-love.pl sitehost google-api-key < access.log > out.html 
      
     e.g. 
      
       cat /var/www/logs/taint.org.* | \ 
         goog-love.pl taint.org kjkdf909403g0fg0f90dfgdf0g09gfd | \ 
         tee out.html 
      
      
     NOTE: this script requires the "SOAP::Lite" module be installed.  Install 
     it using "apt-get install libsoap-lite-perl" or "cpan SOAP::Lite".  It also 
     requires a Google API key. 
      
     }; 
     } 
      
     # version: Feb  7 2006 jm  
      
     # --------------------------------------------------------------------------- 
      
     use warnings; 
     use strict; 
      
     use SOAP::Lite; 
     use CGI;        # to decode the CGI-formatted query string 
      
     # cf. http://www.oreillynet.com/pub/a/network/excerpt/ggl_hcks/index.html
     # for the general Google-API idea.  Net::Google doesn't seem to work
     # anymore :(
      
     # Unbuffered STDOUT, so the "progress:" lines show up as they happen.
     $| = 1;

     # Everything learned about each search, keyed by the raw (still
     # URL-encoded) query string taken from the referer.
     my %queries;

     # SOAP stub generated from Google's published WSDL.
     my $google = SOAP::Lite->service('http://api.google.com/GoogleSearch.wsdl');

     # (disabled) proxy support:
     # $ua = LWP::UserAgent->new;
     # $ua->env_proxy;

     # The three stages, in order: collect searches from the log, re-run the
     # most frequent ones against Google, then print the report.
     foreach my $stage (\&readlogs, \&google, \&summarise) {
       $stage->();
     }

     exit 0;
      
     # --------------------------------------------------------------------------- 
      
    # "http://www.google.it/search?q=%2B%22Bob+Menschel%22+%2Bspamassassin&hl=it&lr=&client=firefox-a&rls=org.mozilla:en-US:official&start=10&sa=N" 
    # "http://www.google.com/search?q=%22History+of+CD-ROM%22&hl=en&lr=&rls=GGLD,GGLD:2005-09,GGLD:en&start=50&sa=N" 
     # etc. 
     # 
     # Read web-server access-log lines from the files named in @ARGV or, when
     # there are none, from STDIN, and record every Google search that led a
     # visitor to this site.
     #
     # Only "GET" requests in the "combined" log format are considered.  For
     # each line whose referer mentions "google" and carries a "q=" parameter,
     # add_query() is called with the still URL-encoded query and the requested
     # path.  Takes no arguments; the return value is not meaningful.
     #
     sub readlogs
     {
       while (my $line = <>) {
         # combined format: host ident user [date zone] "request" status bytes
         # "referer" "user-agent".  The host may be a hostname or an IPv6
         # address rather than a dotted quad, and the protocol is normally
         # HTTP/1.1 these days, so neither is pinned to a single spelling.
         my ($path, $referer) = $line =~
             m{^\S+ \S+ \S+ \S+ \S+ "GET (.*?) HTTP/\d+(?:\.\d+)?" \S+ \S+ "(.+?)" "}
           or next;

         next unless $referer =~ /google/i;

         # the raw "q" parameter, up to the next "&" or the end of the URL.
         # Testing the match itself (not the captured value) keeps a search
         # for "0" from being thrown away as false.
         my ($query) = $referer =~ /[?&]q=([^&]+)(?:&|$)/
           or next;

         add_query($query, $path);
       }
     }
      
     # Note one more visit that arrived via the search $query and landed on
     # $path.  The per-query record in %queries is created on first sight; it
     # holds a hit counter and the set of landing paths seen so far.
     #
     sub add_query
     {
       my ($query, $path) = @_;

       unless ($queries{$query}) {
         $queries{$query} = { count => 0, paths => { } };
       }

       my $entry = $queries{$query};
       $entry->{paths}{$path} = 1;
       return $entry->{count}++;
     }
      
     # Re-run the most frequent searches against the Google SOAP API and note
     # where (if anywhere) $site shows up in the first ten results.
     #
     # Reads the file-level $site, $google_api_key, $google and %queries.  For
     # each of the top $maxqueries entries of %queries it stores:
     #   rawqstring - the query exactly as found in the referer
     #   qstring    - the CGI-decoded query
     #   posn       - 1-based rank of the first result URL containing $site,
     #                or 999999 when the site is not among the results
     #   url        - that result's URL (only set when the site was found)
     # Progress lines are printed to STDOUT; a search that fails is reported
     # on STDERR and skipped.  Takes no arguments.
     #
     sub google
     {
       my $maxqueries = 100;

       print "progress: found ".(scalar keys %queries)." searches.  ".
             "checking the top $maxqueries against google...\n";

       # most frequent first; ties are broken by the query text so that the
       # top-N cut does not depend on hash ordering
       my @ranked = sort {
           $queries{$b}{count} <=> $queries{$a}{count} or $a cmp $b
         } keys %queries;

       # $site is a literal host name, not a pattern: quote it so that the
       # "." in "example.org" cannot match an arbitrary character
       my $site_re = qr/\Q$site\E/i;

       my $qcount = 0;
       foreach my $query (@ranked)
       {
         $qcount++;
         last if ($qcount > $maxqueries);

         # let CGI undo the URL-encoding of the query string
         my $cgi = CGI->new('q='.$query);
         my $qstring = $cgi->param('q');

         $queries{$query}{rawqstring} = $query;
         $queries{$query}{qstring} = $qstring;
         $queries{$query}{posn} = 999999;    # ie. "crappy"

         # nothing usable to search for
         next unless defined $qstring && length $qstring;

         # print "googling: [$qstring] ($queries{$query}{count} hits)\n";

         # One bad response (transport error, SOAP fault, no result list)
         # should cost us that query only, not the whole run.
         my @hits;
         my $ok = eval {
           my $results = $google->doGoogleSearch(
               $google_api_key,
               $qstring,
               0, 10, "false", "",  "false", "", "latin1", "latin1"
           );
           @hits = @{ $results->{'resultElements'} || [] } if $results;
           1;
         };
         if (!$ok) {
           warn "warning: google search for [$qstring] failed: $@";
           next;
         }
         @hits or next;

         my $posn = 0;
         foreach my $result (@hits) {
           $posn++;
           # print $result->{URL}, $result->{snippet},"\n";
           next unless defined $result->{URL};
           if ($result->{URL} =~ $site_re) {
             $queries{$query}{posn} = $posn;
             $queries{$query}{url} = $result->{URL};
             print "progress: \@$qcount - #$posn for [$qstring]: $result->{URL}\n";
             last;
           }
         }
       }
     }
      
     # Print the final report to STDOUT: every search for which google() found
     # the site in the results, best position first (shorter queries first
     # within the same position).
     #
     # The report is one chunk of HTML: the plain-text list is wrapped in an
     # HTML comment, followed by the same list as a <ul> of links to the Google
     # search and to the winning page.  Reads the file-level %queries; takes no
     # arguments.
     #
     sub summarise
     {
       # only queries that were actually checked and won a position
       my @winners = sort {
           $queries{$a}{posn} <=> $queries{$b}{posn} or
           length($queries{$a}{qstring}) <=> length($queries{$b}{qstring})
         } grep {
           $queries{$_}{posn} && ($queries{$_}{posn} < 999999) &&
           $queries{$_}{qstring} && $queries{$_}{url}
         } keys %queries;

       my $otext = '';
       my $ohtml = '';

       foreach my $query (@winners)
       {
         my $entry = $queries{$query};
         my $url = $entry->{url};
         my $posn = $entry->{posn};
         my $qstring = $entry->{qstring};
         my $qhref = 'http://www.google.com/search?num=10&q='.$entry->{rawqstring};

         $otext .= "#$posn for [$qstring]: $url\n";

         # Queries come from referer headers and URLs from search results:
         # both are untrusted, so escape them before they go into markup.
         my ($h_qstring, $h_url, $h_qhref) =
             map { CGI::escapeHTML($_) } ($qstring, $url, $qhref);

         $ohtml .= qq{
          <li>#$posn for <a href="$h_qhref">$h_qstring</a>: <a href="$h_url">$h_url</a></li>
         };
       }

       # The text list sits inside an HTML comment; make sure nothing in it
       # can close that comment early.
       $otext =~ s/--(!?)>/--$1&gt;/g;

       print qq{
       <!--

       Final, sorted list, as text:

     $otext

       As HTML: -->
       <ul>$ohtml</ul>

       };
     }
     

Share This Page