// File viewer metadata (not part of the program): 173 lines, 5.9 KiB, C#
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using HtmlAgilityPack;

namespace New_Spider
{
    /// <summary>
    /// Scrapes the institution list pages of gaokao.chsi.com.cn with HtmlAgilityPack and
    /// collects the introduction text found on each institution's detail page.
    /// </summary>
    public class HtmlAgHelper
    {
        /// <summary>Prefix that every scraped <c>href</c> value is appended to.</summary>
        private const string SiteRoot = "https://gaokao.chsi.com.cn/";

        /// <summary>List page URL; <c>{0}</c> is the <c>start</c> offset of the page.</summary>
        private const string ListPageUrlFormat =
            "https://gaokao.chsi.com.cn/sch/search--ss-on,option-qg,searchType-1,start-{0}.dhtml";

        /// <summary>Selects the per-institution entries on a list page.</summary>
        private const string SchoolEntryXPath = "//*[@id=\"app-yxk-sch-list\"]/div[1]/div[*]";

        /// <summary>Anchor of an institution, relative to one entry selected by <see cref="SchoolEntryXPath"/>.</summary>
        private const string SchoolLinkXPath = "./div[1]/div/a";

        /// <summary>Link on the institution page that leads to the page holding the introduction.</summary>
        private const string DetailLinkXPath = "/html/body/div[1]/div[3]/div/a[2]";

        /// <summary>Element whose inner text is taken as the institution introduction.</summary>
        private const string DescriptionXPath = "/html/body/div[1]/div[4]/div[3]";

        /// <summary>
        /// Parses the institution list and, for every list page, hands the collected
        /// <see cref="JsonObj"/> records (serialized with Newtonsoft.Json) to
        /// <c>ITextGen.GenJson</c> together with the page offset.
        /// </summary>
        /// <remarks>
        /// A list page without any matching entry is reported on standard error and skipped;
        /// <c>GenJson</c> is not called for it.
        /// </remarks>
        public void HtmlCreatePageData()
        {
            // With an upper bound of 1 and a step of 20 only offset 0 is fetched.
            // The previous, commented-out loop used an upper bound of 2800.
            const int lastPageStart = 1;
            const int pageSize = 20;

            HtmlWeb webClient = new HtmlWeb();
            ITextGen textgen = new ITextGen();

            for (int start = 0; start <= lastPageStart; start += pageSize)
            {
                var jsonobjs = new List<JsonObj>();

                HtmlDocument doc = webClient.Load(string.Format(ListPageUrlFormat, start));
                HtmlNodeCollection entries = doc.DocumentNode.SelectNodes(SchoolEntryXPath);

                // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
                if (entries == null)
                {
                    Console.Error.WriteLine(string.Format("No institution entries found on list page start={0}; page skipped.", start));
                    continue;
                }

                foreach (HtmlNode entry in entries)
                {
                    // Pause between institutions to throttle requests.
                    Thread.Sleep(500);

                    // Resolve the anchor relative to the entry itself instead of re-querying the
                    // document by position, so the result cannot drift from the entry being visited.
                    HtmlNode link = entry.SelectSingleNode(SchoolLinkXPath);
                    string href = GetHref(link);
                    if (href == null)
                    {
                        Console.Error.WriteLine(string.Format("List page start={0}: entry without an institution link skipped.", start));
                        continue;
                    }

                    Console.WriteLine(string.Format("item: a:https://gaokao.chsi.com.cn/{0};name:{1}", href, link.InnerText));

                    HtmlUniversityAgResolve(doc, webClient, link, link.InnerText.Trim(), jsonobjs);
                }

                // Pause between list pages.
                Thread.Sleep(1000);
                textgen.GenJson(Newtonsoft.Json.JsonConvert.SerializeObject(jsonobjs), start.ToString());
            }
        }

        /// <summary>
        /// One scraped institution record. Property names are lower-case because they are
        /// emitted as-is when the record is serialized.
        /// </summary>
        public class JsonObj
        {
            /// <summary>Timestamp-based identifier assigned when the record is created.</summary>
            public string id { get; set; }

            /// <summary>Institution name as shown on the list page.</summary>
            public string name { get; set; }

            /// <summary>Institution introduction text.</summary>
            public string description { get; set; }
        }

        /// <summary>
        /// Parses an institution information page: loads the page referenced by
        /// <paramref name="htmlNode"/>, finds the link(s) to the introduction page and resolves
        /// each of them through <see cref="HtmlUniversityDescriptionResolve"/>.
        /// </summary>
        /// <param name="htmldoc">Not read; kept so existing callers keep compiling.</param>
        /// <param name="htmlWeb">Loader used to fetch pages.</param>
        /// <param name="htmlNode">Anchor node whose <c>href</c> points at the institution page.</param>
        /// <param name="name">Institution name stored on every produced record.</param>
        /// <param name="jsons">List that receives the produced records.</param>
        /// <remarks>
        /// Failures while resolving a single introduction link are logged to standard error and
        /// do not stop the remaining links; a failure to load the institution page itself still
        /// propagates to the caller.
        /// </remarks>
        public void HtmlUniversityAgResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, HtmlNode htmlNode, string name, List<JsonObj> jsons)
        {
            string schoolHref = GetHref(htmlNode);
            if (schoolHref == null)
            {
                Console.Error.WriteLine(string.Format("Institution '{0}' has no link to follow; skipped.", name));
                return;
            }

            HtmlDocument schoolPage = htmlWeb.Load(SiteRoot + schoolHref);
            HtmlNodeCollection detailLinks = schoolPage.DocumentNode.SelectNodes(DetailLinkXPath);
            if (detailLinks == null)
            {
                return;
            }

            foreach (HtmlNode detailLink in detailLinks)
            {
                // Pause before every follow-up request.
                Thread.Sleep(500);

                string detailHref = GetHref(detailLink);
                if (detailHref == null)
                {
                    Console.Error.WriteLine(string.Format("Institution '{0}': introduction link without href skipped.", name));
                    continue;
                }

                try
                {
                    HtmlUniversityDescriptionResolve(schoolPage, htmlWeb, detailHref, name, jsons);
                }
                catch (Exception e)
                {
                    // Best effort: keep crawling, but leave a trace instead of failing silently.
                    Console.Error.WriteLine(string.Format("Institution '{0}': failed to read introduction from '{1}': {2}", name, detailHref, e.Message));
                }
            }
        }

        /// <summary>
        /// Parses the institution description section: loads the page at <paramref name="url"/>
        /// and appends one <see cref="JsonObj"/> per matching description element.
        /// </summary>
        /// <param name="htmldoc">Not read; kept so existing callers keep compiling.</param>
        /// <param name="htmlWeb">Loader used to fetch the page.</param>
        /// <param name="url">Site-relative address appended to the site root.</param>
        /// <param name="name">Institution name stored on every produced record.</param>
        /// <param name="jsons">List that receives the produced records.</param>
        public void HtmlUniversityDescriptionResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, string url, string name, List<JsonObj> jsons)
        {
            // Institution page.
            HtmlDocument page = htmlWeb.Load(SiteRoot + url);
            HtmlNodeCollection descriptionNodes = page.DocumentNode.SelectNodes(DescriptionXPath);
            if (descriptionNodes == null)
            {
                return;
            }

            foreach (HtmlNode descriptionNode in descriptionNodes)
            {
                jsons.Add(new JsonObj()
                {
                    // Institution introduction.
                    description = descriptionNode.InnerText,
                    name = name,
                    // Year-month-day-hour-minute-second-millisecond; invariant culture keeps the
                    // digits and calendar independent of the machine's regional settings.
                    id = DateTime.Now.ToString("yyyyMMddHHmmssfff", CultureInfo.InvariantCulture),
                });
            }
        }

        /// <summary>
        /// Walks the first list page, follows every institution to its introduction page and
        /// reads the introduction text. The text is not stored or returned; apart from the
        /// requests it issues, the method only prints each institution's link and name.
        /// </summary>
        public void HtmlAg()
        {
            HtmlWeb webClient = new HtmlWeb();
            HtmlDocument listPage = webClient.Load(string.Format(ListPageUrlFormat, 0));

            HtmlNodeCollection entries = listPage.DocumentNode.SelectNodes(SchoolEntryXPath);
            if (entries == null)
            {
                Console.Error.WriteLine("No institution entries found on the first list page.");
                return;
            }

            foreach (HtmlNode entry in entries)
            {
                HtmlNode link = entry.SelectSingleNode(SchoolLinkXPath);
                string href = GetHref(link);
                if (href == null)
                {
                    Console.Error.WriteLine("Entry without an institution link skipped.");
                    continue;
                }

                Console.WriteLine(string.Format("item: a:https://gaokao.chsi.com.cn/{0};name:{1}", href, link.InnerText));

                HtmlDocument schoolPage = webClient.Load(SiteRoot + href);
                HtmlNodeCollection detailLinks = schoolPage.DocumentNode.SelectNodes(DetailLinkXPath);
                if (detailLinks == null)
                {
                    continue;
                }

                foreach (HtmlNode detailLink in detailLinks)
                {
                    string detailHref = GetHref(detailLink);
                    if (detailHref == null)
                    {
                        continue;
                    }

                    // Institution page.
                    HtmlDocument detailPage = webClient.Load(SiteRoot + detailHref);
                    HtmlNodeCollection descriptionNodes = detailPage.DocumentNode.SelectNodes(DescriptionXPath);
                    if (descriptionNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode descriptionNode in descriptionNodes)
                    {
                        // Institution introduction; read but intentionally not used further here.
                        var contexts = descriptionNode.InnerText;
                    }
                }
            }
        }

        /// <summary>
        /// Returns the value of the <c>href</c> attribute of <paramref name="node"/>, or
        /// <c>null</c> when the node or the attribute is missing.
        /// </summary>
        private static string GetHref(HtmlNode node)
        {
            if (node == null)
            {
                return null;
            }

            HtmlAttribute attribute = node.Attributes["href"];
            if (attribute == null)
            {
                return null;
            }

            return attribute.Value;
        }
    }
}