spider.php 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. #!/usr/bin/env php
  2. <?php
  3. /*
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 3 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program. If not, see <https://www.gnu.org/licenses/>.
  16. */
  17. /**
  18. * \file dev/tools/spider.php
  19. * \brief Script to spider Dolibarr app.
  20. *
  21. * To use it:
  22. * - Disable module "bookmark"
  23. * - Exclude param optioncss, token, sortfield, sortorder
  24. */
  // URLs already reported by the crawler; followLink() reads and appends to it through `global`.
  $crawledLinks = array();

  // Deepest recursion level followLink() will still fetch (the start page is depth 0).
  const MAX_DEPTH = 2;
  /**
   * Fetch a page, report every link on it that has not been seen yet, then crawl those links.
   *
   * Newly found links are appended to the global $crawledLinks list and passed to
   * insertIntoDatabase() together with the title and meta data of the page being
   * crawled ($url), not of the link target.
   *
   * @param string $url   URL of the page to fetch
   * @param int    $depth Current recursion depth (0 for the start page)
   * @return void
   */
  function followLink($url, $depth = 0)
  {
      global $crawledLinks;

      if ($depth > MAX_DEPTH) {
          echo "<div style='color:red;'>The Crawler is giving up!</div>";
          return;
      }

      $options = array(
          'http' => array(
              'method' => "GET",
              // The HTTP wrapper reads "user_agent"; a "user-agent" key is ignored.
              'user_agent' => "gfgBot/0.1",
          ),
      );
      $context = stream_context_create($options);

      // A failed request yields false; DOMDocument::loadHTML() throws a ValueError
      // on an empty source in PHP 8, so unreachable or empty pages are skipped.
      $html = @file_get_contents($url, false, $context);
      if ($html === false || $html === '') {
          return;
      }

      $doc = new DOMDocument();
      // Real-world HTML is rarely valid; parser warnings are deliberately silenced.
      @$doc->loadHTML($html);

      $pageTitle = getDocTitle($doc, $url);
      $metaData = getDocMetaData($doc);

      // Links first seen on this page; they are crawled once the page is fully listed.
      $crawling = array();
      foreach ($doc->getElementsByTagName('a') as $anchor) {
          $link = $anchor->getAttribute('href');
          if (ignoreLink($link)) {
              continue;
          }
          $link = convertLink($url, $link);
          if (!in_array($link, $crawledLinks, true)) {
              $crawledLinks[] = $link;
              $crawling[] = $link;
              insertIntoDatabase($link, $pageTitle, $metaData, $depth);
          }
      }

      foreach ($crawling as $crawlURL) {
          followLink($crawlURL, $depth + 1);
      }
  }
  /**
   * Turn a link found on a page into a URL that can be fetched.
   *
   * - "//host/path" (protocol-relative) gets the scheme of $site.
   * - Links starting with "http://", "https://" or "www." are returned unchanged.
   * - "/path" (root-relative) is resolved against the scheme, host and port of $site.
   * - Anything else is appended to $site, which is treated as a directory.
   *
   * @param string $site URL of the page the link was found on
   * @param string $path Raw href value
   * @return string Resolved URL
   */
  function convertLink($site, $path)
  {
      $parts = parse_url($site);
      // Fall back to http when the base URL carries no scheme instead of reading an undefined index.
      $scheme = (is_array($parts) && isset($parts['scheme'])) ? $parts['scheme'] : 'http';

      if (strncmp($path, '//', 2) === 0) {
          // The ':' separator is required: "http" . "//host" alone is not a URL.
          return $scheme.':'.$path;
      }

      if (strncmp($path, 'http://', 7) === 0
          || strncmp($path, 'https://', 8) === 0
          || strncmp($path, 'www.', 4) === 0
      ) {
          return $path;
      }

      if (strncmp($path, '/', 1) === 0 && is_array($parts) && isset($parts['host'])) {
          // A leading slash means "from the host root", not "below the current page".
          $origin = $scheme.'://'.$parts['host'];
          if (isset($parts['port'])) {
              $origin .= ':'.$parts['port'];
          }
          return $origin.$path;
      }

      // Join with exactly one slash whatever the base ends with.
      return rtrim($site, '/').'/'.ltrim($path, '/');
  }
  /**
   * Tell whether a raw href value must be skipped by the crawler.
   *
   * Skipped values: empty hrefs, in-page anchors ("#...") and links using the
   * non-HTTP "javascript:", "mailto:" or "tel:" schemes (matched case-insensitively).
   *
   * @param string $url Raw href value
   * @return boolean True when the link must not be crawled
   */
  function ignoreLink($url)
  {
      $url = trim((string) $url);

      // Test for the empty string first: reading $url[0] on '' raises a warning.
      if ($url === '' || $url[0] === '#') {
          return true;
      }

      foreach (array('javascript:', 'mailto:', 'tel:') as $prefix) {
          if (strncasecmp($url, $prefix, strlen($prefix)) === 0) {
              return true;
          }
      }

      return false;
  }
  /**
   * Report a newly discovered link.
   *
   * Despite its name this function stores nothing: it only prints one record line.
   *
   * @param string $link     URL that was discovered
   * @param string $title    Title printed with the record
   * @param array  $metaData Array whose 'description' and 'keywords' entries are printed
   * @param int    $depth    Depth at which the link was found (currently unused)
   * @return void
   */
  function insertIntoDatabase($link, $title, &$metaData, $depth)
  {
      // Missing entries print as empty strings instead of raising undefined-index warnings.
      $description = isset($metaData['description']) ? $metaData['description'] : '';
      $keywords = isset($metaData['keywords']) ? $metaData['keywords'] : '';

      echo "Inserting new record {URL= ".$link.", Title = '".$title."', Description = '".$description."', Keywords = ' ".$keywords."'}<br/><br/><br/>";
  }
  /**
   * Return the text of the first <title> element of a document, or the URL when there is none.
   *
   * Line breaks are removed and surrounding whitespace is trimmed; an empty
   * result falls back to $url.
   *
   * @param DOMDocument $doc Parsed document
   * @param string      $url URL of the document, used as fallback
   * @return string Title, or $url when no usable title exists
   */
  function getDocTitle(&$doc, $url)
  {
      $titleNodes = $doc->getElementsByTagName('title');
      if ($titleNodes->length === 0) {
          return $url;
      }

      $first = $titleNodes->item(0);
      if ($first === null || $first->nodeValue === null) {
          return $url;
      }

      // Search for real line breaks ("\n" in double quotes) and replace them with
      // nothing; an empty search string makes str_replace() a no-op.
      $title = trim(str_replace(array("\r", "\n"), '', $first->nodeValue));

      return ($title === '') ? $url : $title;
  }
  /**
   * Extract the "keywords" and "description" meta tags of a document.
   *
   * Meta names are matched case-insensitively; when a name appears several times
   * the last occurrence wins. Line breaks are removed from both values.
   *
   * @param DOMDocument $doc Parsed document
   * @return array Array with exactly two keys: 'keywords' ('' when absent) and
   *               'description' ('No Description Available' when absent)
   */
  function getDocMetaData(&$doc)
  {
      $metaData = array();
      foreach ($doc->getElementsByTagName('meta') as $node) {
          // Lower-case the name so that name="Description" is recognised as well.
          $metaData[strtolower(trim($node->getAttribute("name")))] = $node->getAttribute("content");
      }

      $description = isset($metaData['description']) ? $metaData['description'] : 'No Description Available';
      $keywords = isset($metaData['keywords']) ? $metaData['keywords'] : '';

      // Search for real line breaks ("\n" in double quotes) and replace them with
      // nothing; an empty search string makes str_replace() a no-op.
      $lineBreaks = array("\r", "\n");

      return array(
          'keywords' => str_replace($lineBreaks, '', $keywords),
          'description' => str_replace($lineBreaks, '', $description),
      );
  }
  // Start URL: first command-line argument when given, otherwise the local development instance.
  $startUrl = (isset($argv[1]) && $argv[1] !== '') ? $argv[1] : "http://localhost/dolibarr_dev/htdocs";
  followLink($startUrl);