spider.php 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. #!/usr/bin/env php
  2. <?php
  3. /*
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 3 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program. If not, see <https://www.gnu.org/licenses/>.
  16. */
  17. /**
  18. * \file dev/tools/spider.php
  19. * \brief Script to spider Dolibarr app.
  20. *
  21. * To use it:
  22. * - Disable module "bookmark"
  23. * - Exclude param optioncss, token, sortfield, sortorder
  24. */
  // Deepest recursion level followLink() accepts; beyond it the crawler gives up.
  const MAX_DEPTH = 2;

  // List of every link already recorded; followLink() reads and extends it via "global".
  $crawledLinks = array();
  /**
   * Recursively crawl a page: fetch it, collect its links and follow every new one.
   *
   * Each link that is not yet present in the global $crawledLinks list is appended
   * to that list, reported through insertIntoDatabase() and then crawled itself
   * with depth + 1. Pages that cannot be fetched are reported and skipped.
   *
   * @param string $url   URL of the page to fetch
   * @param int    $depth Current recursion depth (0 for the start page)
   * @return void
   */
  function followLink($url, $depth = 0)
  {
      global $crawledLinks;

      if ($depth > MAX_DEPTH) {
          echo "<div style='color:red;'>The Crawler is giving up!</div>";
          return;
      }

      // Tolerate a caller that did not initialise the shared list.
      if (!is_array($crawledLinks)) {
          $crawledLinks = array();
      }

      // The HTTP context option is spelled "user_agent"; a "user-agent" key is ignored.
      // The value must not carry a trailing newline, PHP builds the header line itself.
      $options = array(
          'http' => array(
              'method' => "GET",
              'user_agent' => "gfgBot/0.1",
          ),
      );
      $context = stream_context_create($options);

      // file_get_contents() emits a warning and returns false when the fetch fails.
      // Skip such pages: handing an empty source to loadHTML() throws on PHP 8.
      $html = @file_get_contents($url, false, $context);
      if ($html === false || trim($html) === '') {
          echo "<div style='color:red;'>Unable to fetch ".$url."</div>";
          return;
      }

      // Let libxml collect markup errors internally instead of muting the call with "@".
      $previousSetting = libxml_use_internal_errors(true);
      $doc = new DOMDocument();
      $loaded = $doc->loadHTML($html);
      libxml_clear_errors();
      libxml_use_internal_errors($previousSetting);
      if (!$loaded) {
          return;
      }

      $pageTitle = getDocTitle($doc, $url);
      $metaData = getDocMetaData($doc);

      // Links discovered on this page that still have to be crawled.
      $crawling = array();
      foreach ($doc->getElementsByTagName('a') as $anchor) {
          $link = $anchor->getAttribute('href');
          if (ignoreLink($link)) {
              continue;
          }
          $link = convertLink($url, $link);
          if (!in_array($link, $crawledLinks, true)) {
              $crawledLinks[] = $link;
              $crawling[] = $link;
              // NOTE(review): title and metadata come from the page being parsed ($url),
              // not from the page $link points to.
              insertIntoDatabase($link, $pageTitle, $metaData, $depth);
          }
      }

      foreach ($crawling as $crawlURL) {
          followLink($crawlURL, $depth + 1);
      }
  }
  /**
   * Turn the href found on a page into a URL the crawler can fetch.
   *
   * - "//host/path" (protocol-relative) gets the scheme of $site.
   * - "http://", "https://" and "www." links are returned unchanged.
   * - "/path" (root-relative) is resolved against the scheme, host and port of $site.
   * - anything else is appended to $site, separated by exactly one "/".
   *
   * @param string $site URL of the page on which the link was found
   * @param string $path Raw href value
   * @return string      Converted link
   */
  function convertLink($site, $path)
  {
      $parts = parse_url($site);
      if (!is_array($parts)) {
          // parse_url() returns false on seriously malformed URLs.
          $parts = array();
      }
      $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

      // Protocol-relative link: the ":" after the scheme is required to form a valid URL.
      if (strncmp($path, '//', 2) === 0) {
          return $scheme.':'.$path;
      }

      // Already absolute (or treated as such by this crawler): keep as is.
      if (strncasecmp($path, 'http://', 7) === 0
          || strncasecmp($path, 'https://', 8) === 0
          || strncasecmp($path, 'www.', 4) === 0) {
          return $path;
      }

      // Root-relative link: resolve against the origin, not against the page URL.
      if (strncmp($path, '/', 1) === 0 && isset($parts['host'])) {
          $origin = $scheme.'://'.$parts['host'];
          if (isset($parts['port'])) {
              $origin .= ':'.$parts['port'];
          }
          return $origin.$path;
      }

      // Plain relative link: append to $site without doubling the separator.
      return rtrim($site, '/').'/'.ltrim($path, '/');
  }
  /**
   * Tell whether a link must be skipped by the crawler.
   *
   * Skipped are: empty hrefs, in-page fragments ("#...") and links using a
   * pseudo-scheme that cannot be fetched ("javascript:", "mailto:", "tel:").
   *
   * @param string $url Raw href value
   * @return boolean    True when the link must not be followed
   */
  function ignoreLink($url)
  {
      $url = trim((string) $url);

      // Reading $url[0] on an empty string raises a warning, so test for it first.
      // An empty href, like a pure fragment, only points back to the current page.
      if ($url === '' || $url[0] === '#') {
          return true;
      }

      // Scheme names are case-insensitive ("JavaScript:" is valid too).
      foreach (array('javascript:', 'mailto:', 'tel:') as $scheme) {
          if (strncasecmp($url, $scheme, strlen($scheme)) === 0) {
              return true;
          }
      }

      return false;
  }
  /**
   * Report a newly discovered link.
   *
   * Despite its name this function does not write to any database: it only
   * prints one HTML-formatted record to the standard output.
   *
   * @param string $link     URL of the discovered link
   * @param string $title    Title printed with the record
   * @param array  $metaData Metadata; the 'description' and 'keywords' entries are printed
   * @param int    $depth    Crawl depth (currently unused)
   * @return void
   */
  function insertIntoDatabase($link, $title, &$metaData, $depth)
  {
      $description = $metaData['description'];
      $keywords = $metaData['keywords'];

      $record = "Inserting new record {URL= ".$link;
      $record .= ", Title = '".$title."'";
      $record .= ", Description = '".$description."'";
      $record .= ", Keywords = ' ".$keywords."'}";
      $record .= "<br/><br/><br/>";

      echo $record;
  }
  /**
   * Return the title of a parsed document on a single line.
   *
   * Falls back to $url when the document has no <title> element or when the
   * title is empty once line breaks and surrounding whitespace are removed.
   *
   * @param DOMDocument $doc Parsed document
   * @param string      $url URL of the document, used as fallback
   * @return string          Title, or $url when no usable title exists
   */
  function getDocTitle(&$doc, $url)
  {
      $titleNodes = $doc->getElementsByTagName('title');
      if ($titleNodes->length == 0) {
          return $url;
      }

      // str_replace() with an empty search string changes nothing, so the title must be
      // searched for the actual line-break characters to end up on one line.
      $rawTitle = (string) $titleNodes->item(0)->nodeValue;
      $title = trim(str_replace(array("\r\n", "\r", "\n"), ' ', $rawTitle));

      return ($title === '') ? $url : $title;
  }
  /**
   * Extract the description and keywords of a parsed document.
   *
   * Reads every <meta> element, indexes its "content" by its lower-cased "name"
   * and returns the two entries the crawler reports, each on a single line.
   *
   * @param DOMDocument $doc Parsed document
   * @return array           Array with the keys 'keywords' and 'description'
   */
  function getDocMetaData(&$doc)
  {
      $metaData = array();
      foreach ($doc->getElementsByTagName('meta') as $node) {
          // Lower-case the name so that name="Description" is found as well.
          $metaData[strtolower($node->getAttribute("name"))] = $node->getAttribute("content");
      }

      if (!isset($metaData['description'])) {
          $metaData['description'] = 'No Description Available';
      }
      if (!isset($metaData['keywords'])) {
          $metaData['keywords'] = '';
      }

      // str_replace() with an empty search string changes nothing, so search for the
      // actual line-break characters to keep each value on one line.
      $lineBreaks = array("\r\n", "\r", "\n");

      return array(
          'keywords' => str_replace($lineBreaks, ' ', $metaData['keywords']),
          'description' => str_replace($lineBreaks, ' ', $metaData['description']),
      );
  }
  // Start URL: first command-line argument when given, otherwise the built-in default.
  $startUrl = (isset($argv[1]) && $argv[1] !== '') ? $argv[1] : "http://localhost/dolibarr_dev/htdocs";
  followLink($startUrl);