火车头采集伪原创插件PHP版实现

很多码农都有自己的个人博客，平时写写笔记什么的。今天我不是谈如何做博客，而是谈如何做搜索引擎优化，结合自己多年的PHP经验来实现网站自动优化。
做一个原始网站或伪原创不仅仅是整合文本。SEO优化的原始文章与添加或删除内容无关。我喜欢用火车头来实现内容爬取，然后自己写一个伪原创插件，有兴趣大家可以自己实践下。
准备撰写文章时，请务必在撰写之前考虑本文的标题和布局。本文的标题可能基于哪个标题，并且由优化程序撰写的文章也考虑添加“锚文本”或“软链接”，因为在写文章之前，这些是要注意的事情。
伪原创或原始文章必须引起用户的注意，而不是假冒作品来欺骗搜索蜘蛛。因此，高质量的文章应具有清晰的结构，清晰的主题选择，清晰的布局并根据主题撰写文章。还可以为优化人员提供时间，并为网站带来良好的优化结果。
下面是火车头伪原创插件PHP代码：
复制代码
<?php
// Plugin entry script. $LabelArray is not defined in this file; it is
// presumably injected by the scraper host before the script runs (TODO confirm).
// The (possibly modified) array is echoed back in serialized form at the end.

// Allow the script to run for up to 270 seconds (the API call may be slow).
set_time_limit(270);
error_reporting(E_ERROR | E_WARNING | E_PARSE);

// Marker inserted between title and body so both can travel through the
// rewrite API in a single request and be split apart again afterwards.
define('TITLE_SEPAR', 'xxx**xxx');

// The key has to be obtained by registering with the xiaofamao service.
$url = 'http://api.xiaofamao.com/api.php?json=0&v=1&key=xxxxxxx';

switch ($LabelArray['PageType']) {
    case 'List': // list page: only HTML can be processed here
        break;

    case 'Pages': // multi-page: only HTML can be processed here
        break;

    case 'Content': // default page: only HTML can be processed here
        break;

    case 'Save': // label values can only be processed at save time
        try {
            // Send "title + separator + body" to the rewrite API.
            $article_src = $LabelArray['标题'] . TITLE_SEPAR . $LabelArray['内容'];
            $article_src = br2newline($article_src);
            $article_new = curl_request($url, array('wenzhang' => $article_src));
            $article_new = fix_newline($article_new);

            $temp = explode(TITLE_SEPAR, $article_new);

            // curl_request() returns the cURL error text on failure, and the API
            // may answer with something that no longer contains the separator.
            // In either case there is no rewritten body, so the scraped content
            // is left untouched instead of being overwritten with an empty value.
            if (isset($temp[1]) && trim($temp[1]) !== '') {
                $LabelArray['内容'] = delete_newline($temp[1]);

                // The rewritten title is prepared here, but writing it back is
                // disabled (as in the original script): the title stays unchanged.
                $new_title = fix_title($temp[0]);
                $new_title = str_replace(array('[', ']', '％'), array('【', '】', '%'), $new_title);
                // $LabelArray['标题'] = $new_title;
            }
        } catch (Exception $e) {
            // Surface the failure inside the saved record.
            $LabelArray['标题'] .= $e->getMessage();
            $LabelArray['内容'] .= $e->getMessage();
        }
        break;

    default:
        // Any other page type: nothing to do.
}

echo serialize($LabelArray);
/**
 * Remove every double-quoted alt="..." attribute from an HTML fragment.
 *
 * The value is matched with [^"]* so that each match ends at the attribute's
 * own closing quote. A greedy ".*" would extend to the last quote on the line
 * and delete every attribute that follows the alt attribute.
 *
 * @param string $contents HTML fragment.
 * @return string|null The fragment without alt attributes (surrounding
 *                     whitespace is left as is); null if preg_replace fails.
 */
function remove_alt($contents) {
    return preg_replace('/alt="[^"]*"/', '', $contents);
}
/**
 * Strip common full-width and ASCII punctuation marks from a title.
 *
 * @param string $contents Title text.
 * @return string The title with the listed punctuation removed.
 */
function fix_title($contents) {
    $marks = array(
        '。', '？', '，', '：', '；', '、', '！',
        '.', '?', ',', ':', ';', '!',
    );

    // Remove the marks one after another, in list order.
    foreach ($marks as $mark) {
        $contents = str_replace($mark, '', $contents);
    }

    return $contents;
}
/**
 * Convert HTML line-break tags into PHP_EOL.
 *
 * Matches <br>, <br/> and <br /> in any letter case. The previous body ran
 * six identical replacements of a single space, which turned every space
 * (including those inside tags) into a newline and never touched <br> tags.
 *
 * NOTE(review): the original search strings appear to have been lost when the
 * code was published; if other tags (e.g. </p>) were converted too, add them here.
 *
 * @param string $contents HTML fragment.
 * @return string|null The fragment with <br> variants replaced by PHP_EOL;
 *                     null if preg_replace fails.
 */
function br2newline($contents) {
    return preg_replace('/<br\s*\/?>/i', PHP_EOL, $contents);
}
/**
 * Replace each PHP_EOL with a space and then remove every space.
 *
 * NOTE(review): the name suggests newlines should become <br> tags, but the
 * literals in the code are plain spaces, so the net effect is that all line
 * breaks and all spaces are deleted. The literals may have been damaged when
 * the code was published — confirm the intended replacement strings.
 *
 * @param string $contnets Text to process.
 * @return string The text without PHP_EOL sequences and without spaces.
 */
function newline2br($contnets) {
    $joined = str_replace(PHP_EOL, " ", $contnets);

    return str_replace(' ', '', $joined);
}
/**
 * Normalise line breaks in the given text.
 *
 * Thin wrapper: the whole job is delegated to fix_newline().
 *
 * @param string $contents Text to normalise.
 * @return string Whatever fix_newline() returns for the input.
 */
function delete_newline($contents) {
    return fix_newline($contents);
}
/**
 * Rewrite every line ending (CRLF, lone CR, LF) as PHP_EOL.
 *
 * Unlike fix_newline(), consecutive line breaks are kept, not collapsed.
 *
 * @param string $contents Text with mixed line endings.
 * @return string Text whose line endings are all PHP_EOL.
 */
function reset_newline_win($contents) {
    // CRLF is handled before the lone CR so a pair becomes one "\n", not two.
    $unix = str_replace(array("\r\n", "\r"), "\n", $contents);

    return str_replace("\n", PHP_EOL, $unix);
}
/**
 * Collapse every run of line breaks into a single PHP_EOL.
 *
 * Carriage returns are first turned into "\n" (so CRLF becomes two "\n"),
 * then runs of "\n" are squeezed to one, and finally each remaining "\n"
 * is written out as PHP_EOL.
 *
 * @param string $data Text to normalise.
 * @return string Text with no consecutive line breaks.
 */
function fix_newline($data) {
    $data = str_replace("\r", "\n", $data);

    // Keep halving the runs until a pass finds nothing left to replace.
    do {
        $data = str_replace("\n\n", "\n", $data, $collapsed);
    } while ($collapsed > 0);

    return str_replace("\n", PHP_EOL, $data);
}
/**
 * Strip HTML attributes from a fragment using the cleanHtml class.
 *
 * "src" is allowed on every element; <img> additionally keeps "alt" and
 * <iframe> additionally keeps "frameborder". All other attributes are removed.
 *
 * @param string $contents HTML fragment.
 * @return string The result of cleanHtml::strip() for the fragment.
 */
function clean_contents($contents) {
    $cleaner = new cleanHtml;

    // Attribute allowed on any element.
    $cleaner->allow = array('src');

    // Per-element allow lists.
    $cleaner->exceptions = array(
        'img' => array('src', 'alt'),
        'iframe' => array('src', 'frameborder'),
    );

    return $cleaner->strip($contents);
}
/**
 * Replace only the first occurrence of $search in $subject.
 *
 * @param string $search  Text to look for.
 * @param string $replace Text to put in its place.
 * @param string $subject Text to search in.
 * @return string $subject with the first match replaced, or unchanged when
 *                $search does not occur.
 */
function xfm_strong_str_replace_once($search, $replace, $subject) {
    $at = strpos($subject, $search);

    if ($at === false) {
        return $subject;
    }

    $head = substr($subject, 0, $at);
    $tail = substr($subject, $at + strlen($search));

    return $head . $replace . $tail;
}
/**
 * Perform an HTTP request with cURL.
 *
 * @param string       $url          URL to request.
 * @param array|string $post         POST data; when empty a GET request is sent.
 *                                   Arrays are URL-encoded, strings are sent as given.
 * @param string       $cookie       Cookie header value to send (optional).
 * @param int|bool     $returnCookie When truthy, response headers are requested and
 *                                   the first Set-Cookie value is returned with the body.
 * @return string|array The response body; or, when $returnCookie is truthy,
 *                      array('cookie' => ..., 'content' => ...). On a cURL failure
 *                      the cURL error message is returned as a string.
 */
function curl_request($url, $post = '', $cookie = '', $returnCookie = 0) {
    if (!extension_loaded('curl')) {
        file_exists('./ext/php_curl.dll') && dl('php_curl.dll'); // load the extension
    }

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
    // Only follow redirects when neither open_basedir nor safe_mode is set.
    if (ini_get('open_basedir') == '' && strtolower(ini_get('safe_mode')) != 'on') {
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    }
    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
    curl_setopt($curl, CURLOPT_REFERER, "http://XXX");
    if ($post) {
        curl_setopt($curl, CURLOPT_POST, 1);
        // http_build_query() only accepts arrays/objects; pass a ready-made
        // string body through unchanged instead of failing on it.
        curl_setopt($curl, CURLOPT_POSTFIELDS, is_string($post) ? $post : http_build_query($post));
    }
    if ($cookie) {
        curl_setopt($curl, CURLOPT_COOKIE, $cookie);
    }
    curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
    curl_setopt($curl, CURLOPT_TIMEOUT, 150);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $data = curl_exec($curl);
    if (curl_errno($curl)) {
        // Read the message first, then release the handle on this path as well.
        $error = curl_error($curl);
        curl_close($curl);
        return $error;
    }
    curl_close($curl);

    if (!$returnCookie) {
        return $data;
    }

    // Headers and body are separated by the first blank line; the body part
    // may be missing, so do not assume two pieces.
    $parts = explode("\r\n\r\n", $data, 2);
    $info = array(
        'cookie' => '',
        'content' => isset($parts[1]) ? $parts[1] : '',
    );
    // Header names are case-insensitive; a response without a Set-Cookie
    // header simply yields an empty cookie value.
    if (preg_match('/Set-Cookie:([^;]*);/i', $parts[0], $match)) {
        $info['cookie'] = ltrim($match[1]);
    }

    return $info;
}
//echo $tag;
/**
 * Count the characters (not bytes) of a UTF-8 string, e.g. Chinese text.
 *
 * @param string|null $string Text to measure.
 * @return int Number of characters matched by the Unicode-aware "." pattern
 *             (newlines included, because of the "s" modifier).
 */
function utf8_strlen($string = null) {
    $units = array();

    // Each match of "." in UTF-8 mode is one character.
    preg_match_all('/./us', $string, $units);

    return count($units[0]);
}
/**
 * Escape a literal string for use inside a "/"-delimited regular expression.
 *
 * Delegates to preg_quote(). The previous hand-written table mapped "(" and
 * ")" to "$", left "$" unescaped ("\$" inside double quotes is a bare "$")
 * and did not cover "]", "}", "-" or "#", so patterns built from text
 * containing those characters were wrong.
 *
 * @param string $str Literal text.
 * @return string The text with all regex metacharacters and "/" escaped.
 */
function reg_escape( $str )
{
    return preg_quote( $str, '/' );
}

/**
* Strip attribute Class
* Remove attributes from XML elements
* @author David (semlabs.co.uk)
* @version 0.2.1
*/

class cleanHtml{

    /** @var string The markup currently being processed. */
    public $str = '';

    /** @var array Attribute names that are kept on every element. */
    public $allow = array();

    /** @var array Map of element name => attribute names additionally kept on that element. */
    public $exceptions = array();

    /** @var array Element names that are left untouched. */
    public $ignore = array();

    /**
     * Remove all attributes that are not allowed from the given markup.
     *
     * @param string $str Markup to clean.
     * @return string The cleaned markup; non-string or empty input is returned as is.
     */
    public function strip( $str )
    {
        $this->str = $str;

        if( is_string( $str ) && strlen( $str ) > 0 )
        {
            $res = $this->findElements();
            if( is_string( $res ) )
                return $res;
            $nodes = $this->findAttributes( $res );
            $this->removeAttributes( $nodes );
        }

        return $this->str;
    }

    /**
     * Collect every opening tag that carries something after its name.
     *
     * @return array|string List of nodes (literal, name, attributes), or the
     *                      unchanged markup when there is nothing to process.
     */
    private function findElements()
    {
        $nodes = array();
        preg_match_all( "/<([^ !\/\>\n]+)([^>]*)>/i", $this->str, $elements );
        foreach( $elements[1] as $el_key => $element_name )
        {
            $attributes = $elements[2][$el_key];
            if( !$attributes )
                continue;

            if( is_array( $this->ignore ) && !in_array( $element_name, $this->ignore ) )
                $nodes[] = array( 'literal' => $elements[0][$el_key], 'name' => $element_name, 'attributes' => $attributes );
        }

        # Return the markup if there were no attributes to remove.
        # empty() avoids reading index 0 of an empty list.
        if( empty( $nodes ) )
            return $this->str;

        return $nodes;
    }

    /**
     * Parse the raw attribute text of every node into name/value entries.
     *
     * @param array $nodes Nodes as produced by findElements().
     * @return array The nodes with 'attributes' replaced by a list of
     *               (literal, name, value) entries, or null when none were found.
     */
    private function findAttributes( $nodes )
    {
        foreach( $nodes as &$node )
        {
            # Start from a fresh list for every node so nothing leaks between nodes.
            $atts = array();
            preg_match_all( "/([^ =]+)\s*=\s*[\"|']{0,1}([^\"']*)[\"|']{0,1}/i", $node['attributes'], $attributes );
            foreach( $attributes[1] as $att_key => $attribute_name )
            {
                $atts[] = array(
                    'literal' => $attributes[0][$att_key],
                    'name' => $attribute_name,
                    'value' => $attributes[2][$att_key],
                );
            }

            $node['attributes'] = $atts ? $atts : null;
        }
        # Break the reference so later use of $node cannot alter the last element.
        unset( $node );

        return $nodes;
    }

    /**
     * Rewrite each collected tag in $this->str, keeping only permitted attributes.
     *
     * @param array $nodes Nodes as produced by findAttributes().
     * @return void
     */
    private function removeAttributes( $nodes )
    {
        foreach( $nodes as $node )
        {
            # Check if node has any attributes to be kept
            $node_name = $node['name'];
            $new_attributes = '';
            if( is_array( $node['attributes'] ) )
            {
                foreach( $node['attributes'] as $attribute )
                {
                    if( ( is_array( $this->allow ) && in_array( $attribute['name'], $this->allow ) ) || $this->isException( $node_name, $attribute['name'], $this->exceptions ) )
                        $new_attributes = $this->createAttributes( $new_attributes, $attribute['name'], $attribute['value'] );
                }
            }
            $replacement = ( $new_attributes ) ? "<$node_name $new_attributes>" : "<$node_name>";

            # Plain string replacement: the tag is literal text, and a regex
            # replacement would interpret "$1" or "\1" inside attribute values
            # as backreferences and corrupt them.
            $this->str = str_replace( $node['literal'], $replacement, $this->str );
        }
    }

    /**
     * Tell whether an attribute is explicitly permitted for an element.
     *
     * @param string $element_name   Element name.
     * @param string $attribute_name Attribute name.
     * @param array  $exceptions     Unused; $this->exceptions is consulted.
     * @return bool True when $this->exceptions lists the attribute for the element.
     */
    private function isException( $element_name, $attribute_name, $exceptions )
    {
        if( array_key_exists($element_name, $this->exceptions) )
        {
            if( in_array( $attribute_name, $this->exceptions[$element_name] ) )
                return true;
        }

        return false;
    }

    /**
     * Append name="value" to an attribute string, space-separated.
     *
     * @param string $new_attributes Attribute string built so far.
     * @param string $name           Attribute name.
     * @param string $value          Attribute value (written as is).
     * @return string The extended attribute string.
     */
    private function createAttributes( $new_attributes, $name, $value )
    {
        if( $new_attributes )
            $new_attributes .= " ";
        $new_attributes .= "$name=\"$value\"";

        return $new_attributes;
    }

}
?>

标签：

本文链接：http://78moban.cn/post/17086.html

上一篇：火车头伪原创php-火车头采集伪原创插件

下一篇：返回列表

火车头采集伪原创插件PHP版实现

robot