因为部分原因,需要用到天眼查的数据,虽然开了天眼查的会员账号,但是导出的Excel数据字段内容太少,例如工商注册号、组织机构代码等字段都没有,感觉有些不完整,所以利用数据库里已有的公司名称来进行采集,将信息页的内容采集完整
吐槽下,天眼查的反爬确实蛮多的,最早用软件采集,但是没有配置采集频率,结果两分钟左右爬了100个页面就跳出验证码界面,验证码的图片也很讨厌,看得眼睛很花
因为知道明确的公司名称,采集思路就快很多,做了个winform 窗口的,两个按钮,一个开始一个结束,两个文本框,一个用来显示当前的采集url,另一个是显示采集成功的结果累计页
采集的过程就是连接完数据库后,根据查询语句从数据库中捞出相应数据形成List,加一个webBrowser做可视化界面,因为验证码绕不开,需要点击.
总体的思路是:根据公司名称拼凑成url,然后解析html,找到详细页的url,再解析第二次的详细页的html,抓取相应数据,保存到数据库
因为不太懂多线程的原理,所以就在主线程上设置了休眠1秒,勉强1小时跳一次验证码,采集的速度也慢很多,相当于手工模拟点击,因为天眼查一定要登录,出现验证码的页面会显示手机号和ip地址,感觉对这两个应该是有限制的.
贴上部分代码作为参考
初始化一些需要的字段
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using HtmlAgilityPack;
using Ivony.Html;
using Ivony.Html.Parser;
using System.Threading;
/// <summary>
/// Scraped former name of the company (written by the detail-page parsing code).
/// </summary>
public string company_used_name = string.Empty;
/// <summary>
/// Scraped "information updated at" text; parsed with DateTime.Parse before saving.
/// </summary>
public string information_update_time = string.Empty;
/// <summary>
/// Scraped organization code of the company.
/// </summary>
public string organizing_code = string.Empty;
/// <summary>
/// Scraped company type.
/// </summary>
public string company_type = string.Empty;
/// <summary>
/// Scraped industry the company belongs to.
/// </summary>
public string industry = string.Empty;
/// <summary>
/// Scraped operating period of the company.
/// </summary>
public string operating_period = string.Empty;
/// <summary>
/// Scraped taxpayer qualification.
/// </summary>
public string taxpayer_qualification = string.Empty;
/// <summary>
/// Scraped staff size.
/// </summary>
public string staff_size = string.Empty;
/// <summary>
/// Scraped paid-in capital.
/// </summary>
public string contribute_capital = string.Empty;
/// <summary>
/// Scraped registration authority.
/// </summary>
public string registration_authority = string.Empty;
/// <summary>
/// Scraped number of insured employees (the field name is spelled "guarabtee").
/// </summary>
public string guarabtee_people = string.Empty;
/// <summary>
/// Scraped English name of the company.
/// </summary>
public string english_name = string.Empty;
/// <summary>
/// Scraped company id, read from the "data-id" attribute on the search result page.
/// </summary>
public string company_id = string.Empty;
/// <summary>
/// Scraped company status.
/// </summary>
public string company_state = string.Empty;
/// <summary>
/// Whether crawling is active: set to true by button1_Click and to false by button2_Click.
/// </summary>
private bool status = false;
/// <summary>
/// URL of the detail page being scraped; reset to empty after the record is saved.
/// </summary>
public string urls = string.Empty;
// Id of the database row currently being scraped (assigned in At).
public int ID;
// Index into list of the next row to scrape; incremented by At.
public int CID = 0;
/// <summary>
/// True until button1_Click runs for the first time; while true, the
/// DocumentCompleted handler ignores page loads.
/// </summary>
public bool isfrist = true;
/// <summary>
/// Database context.
/// </summary>
DB_UsersEntities db = new DB_UsersEntities();
// Rows of the corresponding database table that are queued for scraping (filled in button1_Click).
public List<_Enterprise_TianYanCha> list;
初始界面,第一次登录的时候,记录账号信息的
/// <summary>
/// Runs when the WinForms window loads: opens a fixed search page in the
/// embedded browser and then turns off its script-error dialogs.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form1_Load(object sender, EventArgs e)
{
    var browser = this.webBrowser1;

    // Start loading the initial search page.
    browser.Navigate(new Uri("https://www.tianyancha.com/search?key=洛阳市伊滨区李村镇智华装饰行"));

    // Keep script-error popups from interrupting the session.
    browser.ScriptErrorsSuppressed = true;
}
其他的函数
/// <summary>
/// Appends the details of an exception to a log file.
/// </summary>
/// <param name="ex">The exception to record.</param>
/// <param name="url1">The URL that was being processed when the error occurred.</param>
/// <param name="LogAddress">
/// Path of the log file. When null or empty, a file named
/// "{year}-{month}-{day}_Log.log" in the current directory is used.
/// </param>
public static void WriteLog(Exception ex, string url1, string LogAddress = "")
{
    // Fall back to a per-day log file in the current directory.
    if (string.IsNullOrEmpty(LogAddress))
    {
        // Read the clock once so the three date parts always belong to the same day.
        DateTime today = DateTime.Now;
        LogAddress = $"{Environment.CurrentDirectory}\\{today.Year}-{today.Month}-{today.Day}_Log.log";
    }

    // Append mode; the using block closes the file even if a write throws.
    using (StreamWriter fs = new StreamWriter(LogAddress, true))
    {
        fs.WriteLine("当前url:" + url1);
        fs.WriteLine("当前时间:" + DateTime.Now.ToString());
        fs.WriteLine("异常信息:" + ex.Message);
        fs.WriteLine("异常对象:" + ex.Source);
        // StackTrace is null for an exception object that was never thrown.
        fs.WriteLine("调用堆栈:\n" + (ex.StackTrace ?? string.Empty).Trim());
        fs.WriteLine("触发方法:" + ex.TargetSite);
        fs.WriteLine();
    }
}
/// <summary>
/// Starts scraping the queued row at index <paramref name="i"/>: records its Id,
/// shows the search URL in textBox1 and navigates the browser to the search page
/// for that company name. Used instead of a loop so that each navigation is
/// driven by the previous page's completion.
/// </summary>
/// <param name="i">
/// Index into <c>list</c>. When the list is not loaded or the index is outside it,
/// crawling is stopped (<c>status</c> is set to false) instead of throwing.
/// </param>
public void At(int i)
{
    // End of the queue (or nothing loaded): stop the crawl rather than throw
    // ArgumentOutOfRangeException out of an event handler.
    if (list == null || i < 0 || i >= list.Count)
    {
        status = false;
        return;
    }

    var item = list[i];
    if (item == null)
    {
        return;
    }

    ID = item.Id;

    // Human-readable search URL shown in the UI.
    string str = $"https://www.tianyancha.com/search?key={item.Company_Name}";
    this.textBox1.Text = str;

    // Escape the name for the actual request so characters such as '&', '#' or '+'
    // stay part of the key instead of being read as URL syntax.
    string companyName = $"{item.Company_Name}";
    Uri Url = new Uri("https://www.tianyancha.com/search?key=" + Uri.EscapeDataString(companyName));
    this.webBrowser1.Navigate(Url);
    CID++;
}
#region MyRegion
/// <summary>
/// Loads <paramref name="rqurl"/> in a temporary WebBrowser control and returns
/// the text of the loaded document.
/// </summary>
/// <param name="rqurl">The URL to load.</param>
/// <returns>The browser's DocumentText once its ReadyState is Complete.</returns>
public string _GetHtml_WebBrowser(string rqurl)
{
    // WebBrowser is a disposable control; release it once the HTML has been read.
    using (WebBrowser browser = new WebBrowser())
    {
        browser.ScriptErrorsSuppressed = true;
        browser.Navigate(rqurl);

        // Pump the message loop until the page has finished loading.
        while (browser.ReadyState != WebBrowserReadyState.Complete)
        {
            System.Windows.Forms.Application.DoEvents();
        }

        return browser.DocumentText;
    }
}
/// <summary>
/// Keeps the result text box scrolled to the bottom whenever its text changes,
/// and empties it once it holds more than 500 characters.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void textBox3_TextChanged(object sender, EventArgs e)
{
    var logBox = this.textBox3;

    // Put an empty selection at the end of the text and scroll it into view.
    logBox.SelectionStart = logBox.Text.Length;
    logBox.SelectionLength = 0;
    logBox.ScrollToCaret();

    if (logBox.Text.Length <= 500)
    {
        return;
    }

    // Over the limit: start again with an empty box.
    logBox.Clear();
}
/// <summary>
/// Click handler for the stop button: clears the <c>status</c> flag.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, EventArgs e)
{
// While status is false, webBrowser1_DocumentCompleted skips its processing.
status = false;
}
主程序
/// <summary>
/// Click handler for the start button: marks crawling as active, loads the rows
/// whose Organizing_Code is empty and starts scraping at the current index.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    status = true;
    isfrist = false;

    // Rows still missing an organization code, ordered by Id; the first row is
    // skipped and at most 10000 rows are taken.
    list = db._Enterprise_TianYanCha
        .Where(row => row.Organizing_Code == "")
        .OrderBy(row => row.Id)
        .Skip(1)
        .Take(10000)
        .ToList();

    // NOTE(review): CID is not reset when the list is reloaded, so after a restart
    // the crawl resumes at the old index within the new list - confirm this is intended.
    At(CID);
}
/// <summary>
/// Runs when webBrowser1 finishes loading a document. While crawling is active it
/// dispatches on the current URL: a search page is parsed for the link to the
/// company detail page, and a detail page is parsed and saved to the database
/// before the next queued row is started.
/// </summary>
/// <remarks>
/// Any exception is written to the log file and the crawl moves on to the next row.
/// Thread.Sleep(1000) blocks the calling thread for one second after each row as a
/// crude request throttle.
/// NOTE(review): DocumentCompleted can be raised more than once per navigation when
/// a page contains frames; no e.Url check is made here - confirm whether one is needed.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // Ignore page loads unless the crawl is running.
    if (!status)
    {
        return;
    }

    try
    {
        if (isfrist)
        {
            return;
        }

        var urla = this.webBrowser1.Url.ToString();
        if (urla.Contains("https://www.tianyancha.com/search?key"))
        {
            ProcessSearchResultPage();
        }
        else if (urla.Contains("https://www.tianyancha.com/company/"))
        {
            ProcessCompanyDetailPage();
            At(CID);
            Thread.Sleep(1000);
        }
    }
    catch (Exception ex)
    {
        // Url can be null here; dereferencing it unguarded would throw out of the
        // handler from inside this catch block.
        var url1 = this.webBrowser1.Url?.ToString() ?? string.Empty;
        WriteLog(ex, url1, "");
        At(CID);
        Thread.Sleep(1000);
    }
}

/// <summary>
/// Parses the loaded search result page and navigates to the detail page of the
/// first result, remembering its company id. Does nothing when the page has no
/// search container; shows a message box when 5000 or more results are reported.
/// </summary>
private void ProcessSearchResultPage()
{
    var urlhtml = this.webBrowser1.Document.Body.OuterHtml;
    var parser = new JumonyParser();
    var document = parser.Parse(urlhtml);
    if (!urlhtml.Contains("id=\"search"))
    {
        return;
    }

    // Number of search results reported by the page.
    var rt = document.Find("#search .tips-num").FirstOrDefault().InnerText();
    int result = int.Parse(rt.ToString());
    if (result >= 5000)
    {
        MessageBox.Show("要重新点击“开始”按钮");
        return;
    }

    // One result or several: always take the first result item.
    var a = document.Find("#web-content .container-left .search-block .result-list .search-item").FirstOrDefault().OuterHtml();
    var d1 = parser.Parse(a);

    // URL of the company detail page.
    string turl = d1.Find(".search-result-single .content a.name").FirstOrDefault().Attribute("href").AttributeValue;

    // Company id.
    company_id = d1.Find(".search-result-single").FirstOrDefault().Attribute("data-id").AttributeValue;

    // Constructing the Uri validates the link; an invalid one throws and is
    // logged by the caller.
    new Uri(turl);
    this.webBrowser1.Navigate(turl);
    urls = turl;
}

/// <summary>
/// Parses the loaded company detail page, copies the scraped values into the
/// scraping fields and into the database row whose Id equals <c>ID</c>, saves the
/// changes and appends a progress line to textBox3.
/// </summary>
private void ProcessCompanyDetailPage()
{
    var rturlhtml = this.webBrowser1.Document.Body.OuterHtml;
    var rtparser = new JumonyParser();
    var rtdocument = rtparser.Parse(rturlhtml);

    // Former name: text of the second ".tag-list" element in the page header.
    var cpun1 = rtdocument.Find("#company_web_top .box .content").FirstOrDefault().OuterHtml();
    var cpun2 = rtparser.Parse(cpun1);
    var cpun3 = cpun2.Find(".tag-list").ToList()[1].InnerText();
    if (cpun3 != "")
    {
        // NOTE(review): the ".history-content" text is read but the whole tag-list
        // text is what gets stored - confirm which one is intended. The lookup is
        // kept because it throws (and so skips the row) when the element is absent.
        cpun2.Find(".tag-list").ToList()[1].Find(".history-content").FirstOrDefault().InnerText();
    }
    company_used_name = cpun3;

    // Base-info table; its cells are addressed by position below.
    var bussn = rtdocument.Find("#_container_baseInfo .table.-striped-col").FirstOrDefault().OuterHtml();
    var bussn1 = rtparser.Parse(bussn);
    var cells = bussn1.Find("tbody td").ToList();
    var rows = bussn1.Find("tbody tr").ToList();
    Func<int, int, string> cellText = (row, col) => rows[row].Find("td").ToList()[col].InnerText();

    // Business registration number.
    string buss_num = cells[1].InnerText();

    // Company status.
    var c = rtdocument.Find("#_container_baseInfo .table").FirstOrDefault().OuterHtml();
    var c1 = rtparser.Parse(c);
    company_state = c1.Find("tbody tr").ToList()[2].Find("div").ToList()[1].InnerText();

    // Information update time.
    information_update_time = rtdocument.Find("#company_web_top .footer .refesh.float-left .updatetimeComBox").FirstOrDefault().InnerText();

    // Remaining fields, read in the same order as the table lays them out.
    organizing_code = cells[3].InnerText();
    company_type = cellText(1, 3);
    industry = cellText(2, 3);
    operating_period = cellText(3, 1);
    taxpayer_qualification = cellText(4, 1);
    staff_size = cellText(4, 3);
    contribute_capital = cellText(5, 1);
    registration_authority = cellText(5, 3);
    guarabtee_people = cellText(6, 1);
    english_name = cellText(6, 3);

    // Save to the database.
    var item = db._Enterprise_TianYanCha.FirstOrDefault(a => a.Id == ID);
    item.Business_Registration_Number = buss_num;
    item.Company_Used_Name = company_used_name;
    item.Information_Update_Time = DateTime.Parse(information_update_time);
    item.Organizing_Code = organizing_code;
    item.Company_Type = company_type;
    item.Industry = industry;
    item.Operating_Period = operating_period;
    item.Taxpayer_Qualification = taxpayer_qualification;
    item.Staff_Size = staff_size;
    item.Contribute_Capital = contribute_capital;
    item.Registration_Authority = registration_authority;
    item.Guarantee_People = guarabtee_people;
    item.English_Name = english_name;
    item.Company_Id = company_id;
    item.Company_State = company_state;
    db.SaveChanges();

    urls = string.Empty;
    this.textBox3.Text += $"{ID.ToString()}处理完成!!!\r\n";
}
细节上可能还有不足,至少可以跑起来,为了速度快些,多部署几台电脑就好了