前言
上一小节说到,点击版面图片区域,可获取到映射的文章链接。本节将着重实现对文章html的解析,正则匹配走起~
通过对文章html结构的比对,调查发现,文章详情从class="text_c"
的地方开始,所以解析文章的函数如下。
代码实现
在utils目录下新建articleExtract.js
/**
 * Return capture group 1 of the first match of `pattern` in `text`,
 * or an empty string when the pattern does not match.
 *
 * @param {string} text - Text to search.
 * @param {RegExp} pattern - Pattern with at least one capture group.
 * @returns {string} The captured text, or "" when there is no match.
 */
function firstGroup(text, pattern) {
  var found = text.match(pattern);
  return found ? found[1] : "";
}

/**
 * Parse the HTML of an article page into a plain object.
 *
 * Every optional part (headings, source line, images, body) degrades to an
 * empty string / empty array when it is absent, instead of throwing.
 *
 * @param {string} html - Raw HTML of the article page. Non-string input is
 *   treated as an empty document.
 * @param {string} newsHref - Link of the article (copied to `newshref`).
 * @param {string} pagenum - Number of the page the article belongs to.
 * @returns {{newshref: *, pagenum: *,
 *   titleObj: {title: string, sub: string, quote: string, unknown: string, source: string},
 *   imgArray: Array<{imgSrc: string, imgDesc: string}>,
 *   contentArray: Array<{text: string, strong: (string|undefined)}>}}
 *   A new object on every call.
 */
var articleExtract = function (html, newsHref, pagenum) {
  // Line breaks inside the markup become real newlines.
  var page = typeof html === "string" ? html.replace(/<br\/>/ig, "\r\n") : "";

  /* Patterns */
  // Title block. The lazy match stops at the first </div> (the end of the
  // class="lai" block), so only the headings are taken from it.
  var titleReg = /<div[^>]+class="text_c"[^>]*>[\s\S]+?<\/div>/i;
  // Source / date line.
  var sourceReg = /<div[^>]+class="lai"[^>]*>([\s\S]+?)<\/div>/i;
  // Image tables inside the article.
  var imgReg = /<table[^>]+class="pci_c"[^>]*>[\s\S]+?<\/table>/ig;
  // Article body.
  var contentReg = /<!--enpcontent--><p>[\s\S]*?<\/p><!--\/enpcontent-->/i;

  /* Raw fragments */
  var titleMatch = page.match(titleReg);
  var titleHtml = titleMatch ? titleMatch[0] : "";
  var imgHtmlArray = page.match(imgReg) || [];
  var contentMatch = page.match(contentReg);
  var contentHtml = contentMatch ? contentMatch[0] : "";

  /* Source line, with all whitespace removed */
  var source = firstGroup(page, sourceReg).replace(/\s+/g, '');

  /* Images and their captions */
  var imgArray = [];
  var i;
  for (i = 0; i < imgHtmlArray.length; i++) {
    var srcMatch = imgHtmlArray[i].match(/<img src="(.*?)"[^]*>/i);
    if (!srcMatch) {
      // A table without an <img src="..."> has nothing to display.
      continue;
    }
    imgArray.push({
      // Relative links are rewritten against the newspaper's base url.
      imgSrc: srcMatch[1].replace("../../../", 'http://paper.people.com.cn/rmrb/'),
      imgDesc: firstGroup(imgHtmlArray[i], /<p>([\s\S]*?)<\/P>/i)
    });
  }

  /* Headings: every one of them may be missing */
  var h1 = firstGroup(titleHtml, /<h1>([\s\S]+?)<\/h1>/i); // main title
  var h2 = firstGroup(titleHtml, /<h2>([\s\S]*?)<\/h2>/i); // subtitle
  var h3 = firstGroup(titleHtml, /<h3>([\s\S]*?)<\/h3>/i); // intro (kicker) title
  var h4 = firstGroup(titleHtml, /<h4>([\s\S]*?)<\/h4>/i); // meaning unknown

  /* Body paragraphs */
  var contentArray = [];
  // Some articles (e.g. an advertisement that is a single picture) have no
  // body paragraphs at all.
  var paragraphs = contentHtml.match(/<p>.*?<\/p>/ig) || [];
  for (i = 0; i < paragraphs.length; i++) {
    // NOTE(review): whitespace pattern kept as published; the original post
    // may have targeted `&nbsp;` entities here - confirm.
    var text = firstGroup(paragraphs[i], /<p>(.*?)<\/p>/i).replace(/( )+/g, '\t');
    var strongMatch = text.match(/<strong>(.*?)<\/strong>/i);
    if (strongMatch) {
      // Emphasised paragraph: keep only the text of the <strong> element.
      contentArray.push({ "text": strongMatch[1], "strong": "strong" });
    } else if (/<font\b/i.test(text)) {
      // <font> without <strong>: still emphasised, just drop the font tags.
      contentArray.push({ "text": text.replace(/<\/?font[^>]*>/ig, ''), "strong": "strong" });
    } else {
      contentArray.push({ "text": text });
    }
  }

  return {
    newshref: newsHref, // link of this article
    pagenum: pagenum, // number of the page this article is on
    titleObj: {
      title: h1,
      sub: h2,
      quote: h3,
      unknown: h4,
      source: source
    },
    imgArray: imgArray,
    contentArray: contentArray
  };
}
module.exports = articleExtract;
修改pages/article/article.js
添加articleExtract
函数
// Application instance returned by getApp().
var app = getApp();
// Helpers used to build the article url and to parse the article html.
var todayDateArray = require('../../utils/util.js').todayDateArray;
var articleExtract = require('../../utils/articleExtract.js')
// Base address under which the newspaper's html pages are published.
var baseUri = "http://paper.people.com.cn/rmrb/html"
// Variables used to assemble the url (elided in this excerpt)
//...
Page({
  /**
   * Initial page data.
   */
  data: {
    // Parsed article that the template renders; filled in by getArticle.
    articleObj: {}
  },
  //...
  onShow: function () {
    // ...
  },
  /**
   * Request an article page and publish its parsed form as `articleObj`.
   *
   * @param {string} url - Address of the article html.
   * @param {string} newsHref - Link of the article, forwarded to the parser.
   * @param {string} pagenum - Page number, forwarded to the parser.
   */
  getArticle: function (url, newsHref, pagenum) {
    var self = this;
    wx.request({
      url: url,
      success: function (res) {
        // `success` also fires for HTTP error responses; do not feed an
        // error page to the parser.
        if (res.statusCode !== 200) {
          console.error("文章请求失败", res.statusCode);
          return;
        }
        // Parse the article html into title, images and paragraphs.
        var tmpArticleObj = articleExtract(res.data, newsHref, pagenum);
        console.log("文章解析结果", tmpArticleObj);
        self.setData({
          articleObj: tmpArticleObj,
        });
      },
      fail: function (err) {
        // Network-level failure: report it instead of failing silently.
        console.error("文章请求失败", err);
      }
    });
  },
})
文章解析出来了,下一步就是把它显示出来
显示文章
在article.wxml,把内容简单的显示出来
<!--pages/article/article.wxml-->
<!-- Field names below match the object returned by utils/articleExtract.js:
     titleObj.{quote,title,sub,source} and imgArray[].{imgSrc,imgDesc}. -->
<view class="page-contain">
  <view class="article-contain">
    <view class="article-header">
      <view wx:if="{{articleObj.titleObj.quote}}" class="header-introtitle">{{articleObj.titleObj.quote}}</view>
      <view wx:if="{{articleObj.titleObj.title}}" class="header-title">{{articleObj.titleObj.title}}</view>
      <view wx:if="{{articleObj.titleObj.sub}}" class="header-subTitle">{{articleObj.titleObj.sub}}</view>
      <view wx:if="{{articleObj.titleObj.source}}" class="header-authors">{{articleObj.titleObj.source}}</view>
    </view>
    <view class="article-attachment" wx:if="{{articleObj.imgArray}}">
      <view class="attachment-img" wx:for="{{articleObj.imgArray}}">
        <image src="{{item.imgSrc}}" mode='widthFix'></image>
        <view class="attachment-alt" wx:if="{{item.imgDesc}}">{{item.imgDesc}}</view>
      </view>
    </view>
    <view class="article-content" wx:if="{{articleObj.contentArray}}">
      <view class='content-p {{item.strong}}' wx:for="{{articleObj.contentArray}}">
        <text decode='true'>{{item.text}}</text>
      </view>
    </view>
  </view>
</view>
至此,文章详情算是显示出来了,下一步,继续完善