Script PHP per scaricare gli ultimi link da GWT

Era da un po’ di tempo che volevo fare qualcosa di interessante e che interagisse automaticamente con gli Strumenti per Webmaster di Google. In particolar modo ero interessato ai latest link, ovvero gli ultimi link trovati dallo spider di Google.

ghitub

In rete ho trovato questo codice php-webmaster-tools-downloads sviluppato da eyecatchup che però, di default non permette di scaricare i latest link ma solamente i backlink esterni.

Per scaricare i latest link bastano poche linee di codice in più. Ecco qui i file modificati e pronti all’uso. Per ulteriori istruzioni vi rimando alla pagina ufficiale php-webmaster-tools-downloads.

gwtdata.php

<?php
/**
* PHP class for downloading CSV files from Google Webmaster Tools.
*
* This class does NOT require the Zend gdata package be installed
* in order to run.
*
* Copyright 2012 eyecatchUp UG. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @author: Stephan Schmitz <eyecatchup@gmail.com>
* @link: https://code.google.com/p/php-webmaster-tools-downloads/
* @link: https://github.com/eyecatchup/php-webmaster-tools-downloads/
*/

class GWTdata
{
const HOST = "https://www.google.com";
const SERVICEURI = "/webmasters/tools/";

public $_language, $_tables, $_daterange, $_downloaded, $_skipped;
private $_auth, $_logged_in;

public function __construct()
{
$this->_auth = false;
$this->_logged_in = false;
$this->_language = "en";
$this->_daterange = array("","");
$this->_tables = array("TOP_PAGES", "TOP_QUERIES",
"CRAWL_ERRORS", "CONTENT_ERRORS", "CONTENT_KEYWORDS",
"INTERNAL_LINKS", "EXTERNAL_LINKS", "BACKLINKS_LATEST", "SOCIAL_ACTIVITY"
);
$this->_errTablesSort = array(0 => "http",
1 => "not-found", 2 => "restricted-by-robotsTxt",
3 => "unreachable", 4 => "timeout", 5 => "not-followed",
"kAppErrorSoft-404s" => "soft404", "sitemap" => "in-sitemaps"
);
$this->_errTablesType = array(0 => "web-crawl-errors",
1 => "mobile-wml-xhtml-errors", 2 => "mobile-chtml-errors",
3 => "mobile-operator-errors", 4 => "news-crawl-errors"
);
$this->_downloaded = array();
$this->_skipped = array();
}

/**
* Sets content language.
*
* @param $str String Valid ISO 639-1 language code, supported by Google.
*/
public function SetLanguage($str)
{
$this->_language = $str;
}

/**
* Sets features that should be downloaded.
*
* @param $arr Array Valid array values are:
* "TOP_PAGES", "TOP_QUERIES", "CRAWL_ERRORS", "CONTENT_ERRORS",
* "CONTENT_KEYWORDS", "INTERNAL_LINKS", "EXTERNAL_LINKS",
* "SOCIAL_ACTIVITY".
*/
public function SetTables($arr)
{
if(is_array($arr) && !empty($arr) && sizeof($arr) <= 2) {
$valid = array("TOP_PAGES","TOP_QUERIES","CRAWL_ERRORS","CONTENT_ERRORS",
"CONTENT_KEYWORDS","INTERNAL_LINKS","EXTERNAL_LINKS","BACKLINKS_LATEST","SOCIAL_ACTIVITY");
$this->_tables = array();
for($i=0; $i < sizeof($arr); $i++) {
if(in_array($arr[$i], $valid)) {
array_push($this->_tables, $arr[$i]);
} else { throw new Exception("Invalid argument given."); }
}
} else { throw new Exception("Invalid argument given."); }
}

/**
* Sets daterange for download data.
*
* @param $arr Array Array containing two ISO 8601 formatted date strings.
*/
public function SetDaterange($arr)
{
if(is_array($arr) && !empty($arr) && sizeof($arr) == 2) {
if(self::IsISO8601($arr[0]) === true &&
self::IsISO8601($arr[1]) === true) {
$this->_daterange = array(str_replace("-", "", $arr[0]),
str_replace("-", "", $arr[1]));
return true;
} else { throw new Exception("Invalid argument given."); }
} else { throw new Exception("Invalid argument given."); }
}

/**
* Returns array of downloaded filenames.
*
* @return Array Array of filenames that have been written to disk.
*/
public function GetDownloadedFiles()
{
return $this->_downloaded;
}

/**
* Returns array of downloaded filenames.
*
* @return Array Array of filenames that have been written to disk.
*/
public function GetSkippedFiles()
{
return $this->_skipped;
}

/**
* Checks if client has logged into their Google account yet.
*
* @return Boolean Returns true if logged in, or false if not.
*/
private function IsLoggedIn()
{
return $this->_logged_in;
}

/**
* Attempts to log into the specified Google account.
*
* @param $email String User's Google email address.
* @param $pwd String Password for Google account.
* @return Boolean Returns true when Authentication was successful,
* else false.
*/
public function LogIn($email, $pwd)
{
$url = self::HOST . "/accounts/ClientLogin";
$postRequest = array(
'accountType' => 'HOSTED_OR_GOOGLE',
'Email' => $email,
'Passwd' => $pwd,
'service' => "sitemaps",
'source' => "Google-WMTdownloadscript-0.1-php"
);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, $postRequest);
$output = curl_exec($ch);
$info = curl_getinfo($ch);
curl_close($ch);
if($info['http_code'] == 200) {
preg_match('/Auth=(.*)/', $output, $match);
if(isset($match[1])) {
$this->_auth = $match[1];
$this->_logged_in = true;
return true;
} else { return false; }
} else { return false; }
}

/**
* Attempts authenticated GET Request.
*
* @param $url String URL for the GET request.
* @return Mixed Curl result as String,
* or false (Boolean) when Authentication fails.
*/
public function GetData($url)
{
if(self::IsLoggedIn() === true) {
$url = self::HOST . $url;
$head = array("Authorization: GoogleLogin auth=".$this->_auth,
"GData-Version: 2");
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_ENCODING, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, $head);
$result = curl_exec($ch);
$info = curl_getinfo($ch);
curl_close($ch);
echo "<hr>$url<br><br>$result<hr>";
return ($info['http_code']!=200) ? false : $result;
} else { return false; }
}

/**
* Gets all available sites from Google Webmaster Tools account.
*
* @return Mixed Array with all site URLs registered in GWT account,
* or false (Boolean) if request failed.
*/
public function GetSites()
{
if(self::IsLoggedIn() === true) {
$feed = self::GetData(self::SERVICEURI."feeds/sites/");
if($feed !== false) {
$sites = array();
$doc = new DOMDocument();
$doc->loadXML($feed);
foreach ($doc->getElementsByTagName('entry') as $node) {
array_push($sites,
$node->getElementsByTagName('title')->item(0)->nodeValue);
}
return $sites;
} else { return false; }
} else { return false; }
}

/**
* Gets the download links for an available site
* from the Google Webmaster Tools account.
*
* @param $url String Site URL registered in GWT.
* @return Mixed Array with keys TOP_PAGES and TOP_QUERIES,
* or false (Boolean) when Authentication fails.
*/
public function GetDownloadUrls($url)
{
if(self::IsLoggedIn() === true) {
$_url = sprintf(self::SERVICEURI."downloads-list?hl=%s&siteUrl=%s",
$this->_language,
urlencode($url));
$downloadList = self::GetData($_url);
return json_decode($downloadList, true);
} else { return false; }
}

/**
* Downloads the file based on the given URL.
*
* @param $site String Site URL available in GWT Account.
* @param $savepath String Optional path to save CSV to (no trailing slash!).
*/
public function DownloadCSV($site, $savepath=".")
{
if(self::IsLoggedIn() === true) {
$downloadUrls = self::GetDownloadUrls($site);
$filename = parse_url($site, PHP_URL_HOST) ."-". date("Ymd-His");
$tables = $this->_tables;
foreach($tables as $table) {
if($table=="CRAWL_ERRORS") {
self::DownloadCSV_CrawlErrors($site, $savepath);
}
elseif($table=="CONTENT_ERRORS") {
self::DownloadCSV_XTRA($site, $savepath,
"html-suggestions", "\)", "CONTENT_ERRORS", "content-problems-dl");
}
elseif($table=="CONTENT_KEYWORDS") {
self::DownloadCSV_XTRA($site, $savepath,
"keywords", "\)", "CONTENT_KEYWORDS", "content-words-dl");
}
elseif($table=="INTERNAL_LINKS") {
self::DownloadCSV_XTRA($site, $savepath,
"internal-links", "\)", "INTERNAL_LINKS", "internal-links-dl");
}
elseif($table=="EXTERNAL_LINKS") {
self::DownloadCSV_XTRA($site, $savepath,
"external-links-domain", "\)", "EXTERNAL_LINKS", "external-links-domain-dl");
}
elseif($table=="BACKLINKS_LATEST") {
self::DownloadCSV_XTRA($site, $savepath,
"external-links-domain", "\)", "BACKLINKS_LATEST", "backlinks-latest-dl");
}
elseif($table=="SOCIAL_ACTIVITY") {
self::DownloadCSV_XTRA($site, $savepath,
"social-activity", "x26", "SOCIAL_ACTIVITY", "social-activity-dl");
}
else {
$finalName = "$savepath/$table-$filename.csv";
$finalUrl = $downloadUrls[$table] ."&prop=ALL&db=%s&de=%s&more=true";
$finalUrl = sprintf($finalUrl, $this->_daterange[0], $this->_daterange[1]);
self::SaveData($finalUrl,$finalName);
}
}
} else { return false; }
}

/**
* Downloads "unofficial" downloads based on the given URL.
*
* @param $site String Site URL available in GWT Account.
* @param $savepath String Optional path to save CSV to (no trailing slash!).
*/
public function DownloadCSV_XTRA($site, $savepath=".", $tokenUri, $tokenDelimiter, $filenamePrefix, $dlUri)
{
if(self::IsLoggedIn() === true) {
$uri = self::SERVICEURI . $tokenUri . "?hl=%s&siteUrl=%s";
$_uri = sprintf($uri, $this->_language, $site);
if ($filenamePrefix=="BACKLINKS_LATEST"){$number_match_array=2;}else{$number_match_array=0;}
$token = self::GetToken($_uri, $tokenDelimiter, $number_match_array);
$filename = parse_url($site, PHP_URL_HOST) ."-". date("Ymd-His");
$finalName = "$savepath/$filenamePrefix-$filename.csv";
$url = self::SERVICEURI . $dlUri . "?hl=%s&siteUrl=%s&security_token=%s&prop=ALL&db=%s&de=%s&more=true";
$_url = sprintf($url, $this->_language, $site, $token, $this->_daterange[0], $this->_daterange[1]);
self::SaveData($_url,$finalName);
} else { return false; }
}

/**
* Downloads the Crawl Errors file based on the given URL.
*
* @param $site String Site URL available in GWT Account.
* @param $savepath String Optional: Path to save CSV to (no trailing slash!).
* @param $separated Boolean Optional: If true, the method saves separated CSV files
* for each error type. Default: Merge errors in one file.
*/
public function DownloadCSV_CrawlErrors($site, $savepath=".", $separated=false)
{
if(self::IsLoggedIn() === true) {
$type_param = "we";
$filename = parse_url($site, PHP_URL_HOST) ."-". date("Ymd-His");
if($separated) {
foreach($this->_errTablesSort as $sortid => $sortname) {
foreach($this->_errTablesType as $typeid => $typename) {
if($typeid == 1) {
$type_param = "mx";
} else if($typeid == 2) {
$type_param = "mc";
} else {
$type_param = "we";
}
$uri = self::SERVICEURI."crawl-errors?hl=en&siteUrl=$site&tid=$type_param";
$token = self::GetToken($uri,"x26",0);
$finalName = "$savepath/CRAWL_ERRORS-$typename-$sortname-$filename.csv";
$url = self::SERVICEURI."crawl-errors-dl?hl=%s&siteUrl=%s&security_token=%s&type=%s&sort=%s";
$_url = sprintf($url, $this->_language, $site, $token, $typeid, $sortid);
self::SaveData($_url,$finalName);
}
}
}
else {
$uri = self::SERVICEURI."crawl-errors?hl=en&siteUrl=$site&tid=$type_param";
$token = self::GetToken($uri,"x26",0);
$finalName = "$savepath/CRAWL_ERRORS-$filename.csv";
$url = self::SERVICEURI."crawl-errors-dl?hl=%s&siteUrl=%s&security_token=%s&type=0";
$_url = sprintf($url, $this->_language, $site, $token);
self::SaveData($_url,$finalName);
}
} else { return false; }
}

/**
* Saves data to a CSV file based on the given URL.
*
* @param $finalUrl String CSV Download URI.
* @param $finalName String Filepointer to save location.
*/
private function SaveData($finalUrl, $finalName)
{
$data = self::GetData($finalUrl);
if(strlen($data) > 1 && file_put_contents($finalName, utf8_decode($data))) {
array_push($this->_downloaded, realpath($finalName));
return true;
} else {
array_push($this->_skipped, $finalName);
return false;
}
}

/**
* Regular Expression to find the Security Token for a download file.
*
* @param $uri String A Webmaster Tools Desktop Service URI.
* @param $delimiter String Trailing delimiter for the regex.
* @return String Returns a security token.
*/
private function GetToken($uri, $delimiter, $number_match_array)
{
$matches = array();
$tmp = self::GetData($uri);
preg_match_all("#46security_token(.*?)$delimiter#si", $tmp, $matches);
print_r($matches); //***ok ci siamo, guardo qui cosa stampa! non devo prendere il primo ma il 2o o 3o!
//se voglio i latest link devo prendere $matches[1][2]
return substr($matches[1][$number_match_array],3,-1);
}

/**
* Validates ISO 8601 date format.
*
* @param $str String Valid ISO 8601 date string (eg. 2012-01-01).
* @return Boolean Returns true if string has valid format, else false.
*/
private function IsISO8601($str)
{
$stamp = strtotime($str);
return (is_numeric($stamp) && checkdate(date('m', $stamp),
date('d', $stamp), date('Y', $stamp))) ? true : false;
}

}
?>

index.php

<?php
include 'gwtdata.php';
try {
$email = "username@gmail.com";
$password = "******";

# If hardcoded, don't forget trailing slash!
$website = "http://www.domain.com/";

# Valid values are "TOP_PAGES", "TOP_QUERIES", "CRAWL_ERRORS",
# "CONTENT_ERRORS", "CONTENT_KEYWORDS", "INTERNAL_LINKS",
# "EXTERNAL_LINKS" "BACKLINKS_LATEST" and "SOCIAL_ACTIVITY".
$tables = array("BACKLINKS_LATEST");

$gdata = new GWTdata();
if($gdata->LogIn($email, $password) === true)
{
$gdata->SetTables($tables);
$gdata->DownloadCSV($website);
}
} catch (Exception $e) {
die($e->getMessage());
}
?>

Partendo da qui, penso proprio che si possa costruire un ottimo tool per la gestione dei backlinks!

Scritto da in SEO



Lascia un commento