using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
namespace New_Spider.阳光高考
{
/// <summary>
/// Scrapes university listings and introduction texts from the Sunshine Gaokao
/// site (gaokao.chsi.com.cn) with HtmlAgilityPack.
/// </summary>
public class HtmlAgHelper
{
    /// <summary>Prefix prepended to every href read from the site's pages.</summary>
    private const string SiteRoot = "https://gaokao.chsi.com.cn/";

    /// <summary>Listing page URL; {0} is the <c>start</c> offset.</summary>
    private const string ListPageUrlFormat = "https://gaokao.chsi.com.cn/sch/search--ss-on,option-qg,searchType-1,start-{0}.dhtml";

    /// <summary>Console line printed for every school found on a listing page.</summary>
    private const string ItemLogFormat = "item: a:https://gaokao.chsi.com.cn/{0};name:{1}";

    /// <summary>Selects the per-school row containers on a listing page.</summary>
    private const string SchoolRowsXPath = "//*[@id=\"app-yxk-sch-list\"]/div[1]/div[*]";

    /// <summary>
    /// Selects the school's anchor relative to one row container. A relative path is used
    /// so the anchor always belongs to the row being processed; an absolute path indexed by
    /// a running counter can point at a different row, because <c>div[*]</c> skips divs that
    /// have no child elements.
    /// </summary>
    private const string SchoolLinkXPath = "./div[1]/div/a";

    /// <summary>Selects, on a school page, the link leading to the introduction page.</summary>
    private const string IntroLinkXPath = "/html/body/div[1]/div[3]/div/a[2]";

    /// <summary>Selects, on an introduction page, the node holding the introduction text.</summary>
    private const string IntroTextXPath = "/html/body/div[1]/div[4]/div[3]";

    /// <summary>Step between consecutive <c>start</c> offsets of the listing pages.</summary>
    private const int StartStep = 20;

    /// <summary>
    /// Walks the listing pages, follows every school to its introduction page and writes the
    /// collected records of each listing page as one JSON document through <c>ITextGen.GenJson</c>.
    /// </summary>
    /// <param name="lastStart">
    /// Largest <c>start</c> offset to request, inclusive. Offsets advance in steps of 20, so the
    /// default of 1 requests only offset 0; for example, 2800 requests offsets 0, 20, ..., 2800.
    /// </param>
    public void HtmlCreatePageData(int lastStart = 1)
    {
        HtmlWeb webClient = new HtmlWeb();
        ITextGen textgen = new ITextGen();

        for (int start = 0; start <= lastStart; start += StartStep)
        {
            var jsonobjs = new List<JsonObj>();
            HtmlDocument listPage = webClient.Load(string.Format(ListPageUrlFormat, start));

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection rows = listPage.DocumentNode.SelectNodes(SchoolRowsXPath);
            if (rows == null)
            {
                Console.WriteLine(string.Format("No school rows found on listing page start={0}", start));
            }
            else
            {
                foreach (var row in rows)
                {
                    // Throttle requests between schools.
                    System.Threading.Thread.Sleep(500);

                    HtmlNode link = row.SelectSingleNode(SchoolLinkXPath);
                    string href = GetHref(link);
                    if (href == null)
                    {
                        // Row without a usable school link; nothing to follow.
                        continue;
                    }

                    Console.WriteLine(string.Format(ItemLogFormat, href, link.InnerText));
                    HtmlUniversityAgResolve(listPage, webClient, link, link.InnerText.Trim(), jsonobjs);
                }
            }

            // Throttle between listing pages, then persist whatever was collected for this page
            // (an empty array when the page yielded nothing).
            System.Threading.Thread.Sleep(1000);
            textgen.GenJson(Newtonsoft.Json.JsonConvert.SerializeObject(jsonobjs), start.ToString());
        }
    }

    /// <summary>
    /// One scraped record. Property names are lower-case because they are serialized as-is
    /// into the JSON output.
    /// </summary>
    public class JsonObj
    {
        /// <summary>Timestamp-based identifier assigned when the record is created.</summary>
        public string id { get; set; }

        /// <summary>School name as shown on the listing page.</summary>
        public string name { get; set; }

        /// <summary>Introduction text taken from the school's introduction page.</summary>
        public string description { get; set; }
    }

    /// <summary>
    /// Loads a school's page and follows each introduction link found on it.
    /// </summary>
    /// <param name="htmldoc">Not read; the method loads its own document. Kept for signature compatibility.</param>
    /// <param name="htmlWeb">Client used to download pages.</param>
    /// <param name="htmlNode">Anchor node whose <c>href</c> points at the school page.</param>
    /// <param name="name">School name stored on every record produced.</param>
    /// <param name="jsons">List that receives the records.</param>
    public void HtmlUniversityAgResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, HtmlNode htmlNode, string name, List<JsonObj> jsons)
    {
        string schoolHref = GetHref(htmlNode);
        if (schoolHref == null)
        {
            return;
        }

        HtmlDocument schoolPage = htmlWeb.Load(SiteRoot + schoolHref);
        HtmlNodeCollection introLinks = schoolPage.DocumentNode.SelectNodes(IntroLinkXPath);
        if (introLinks == null)
        {
            return;
        }

        foreach (var introLink in introLinks)
        {
            // Throttle requests between introduction pages.
            System.Threading.Thread.Sleep(500);

            string introHref = GetHref(introLink);
            if (introHref == null)
            {
                continue;
            }

            try
            {
                HtmlUniversityDescriptionResolve(schoolPage, htmlWeb, introHref, name, jsons);
            }
            catch (Exception e)
            {
                // Best effort: one failing school must not abort the crawl, but the failure is reported.
                Console.WriteLine(string.Format("Failed to read introduction for {0}: {1}", name, e.Message));
            }
        }
    }

    /// <summary>
    /// Loads an introduction page and appends one record per introduction node to <paramref name="jsons"/>.
    /// </summary>
    /// <param name="htmldoc">Not read; the method loads its own document. Kept for signature compatibility.</param>
    /// <param name="htmlWeb">Client used to download the page.</param>
    /// <param name="url">Href of the introduction page; appended to the site root.</param>
    /// <param name="name">School name stored on every record produced.</param>
    /// <param name="jsons">List that receives the records.</param>
    public void HtmlUniversityDescriptionResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, string url, string name, List<JsonObj> jsons)
    {
        HtmlDocument introPage = htmlWeb.Load(SiteRoot + url);
        HtmlNodeCollection introNodes = introPage.DocumentNode.SelectNodes(IntroTextXPath);
        if (introNodes == null)
        {
            return;
        }

        foreach (var introNode in introNodes)
        {
            jsons.Add(new JsonObj
            {
                description = introNode.InnerText,
                name = name,
                // Year-month-day-hour-minute-second-millisecond, so ids sort chronologically.
                id = DateTime.Now.ToString("yyyyMMddHHmmssfff", System.Globalization.CultureInfo.InvariantCulture),
            });
        }
    }

    /// <summary>
    /// Walks the first listing page, every school page linked from it and every introduction
    /// page linked from those, printing one console line per school. The introduction text is
    /// read but not stored or returned.
    /// </summary>
    public void HtmlAg()
    {
        HtmlWeb webClient = new HtmlWeb();
        HtmlDocument listPage = webClient.Load(string.Format(ListPageUrlFormat, 0));

        HtmlNodeCollection rows = listPage.DocumentNode.SelectNodes(SchoolRowsXPath);
        if (rows == null)
        {
            return;
        }

        foreach (var row in rows)
        {
            HtmlNode link = row.SelectSingleNode(SchoolLinkXPath);
            string schoolHref = GetHref(link);
            if (schoolHref == null)
            {
                continue;
            }

            Console.WriteLine(string.Format(ItemLogFormat, schoolHref, link.InnerText));

            HtmlDocument schoolPage = webClient.Load(SiteRoot + schoolHref);
            HtmlNodeCollection introLinks = schoolPage.DocumentNode.SelectNodes(IntroLinkXPath);
            if (introLinks == null)
            {
                continue;
            }

            foreach (var introLink in introLinks)
            {
                string introHref = GetHref(introLink);
                if (introHref == null)
                {
                    continue;
                }

                HtmlDocument introPage = webClient.Load(SiteRoot + introHref);
                HtmlNodeCollection introNodes = introPage.DocumentNode.SelectNodes(IntroTextXPath);
                if (introNodes == null)
                {
                    continue;
                }

                foreach (var introNode in introNodes)
                {
                    // Introduction text of the school; this method does not keep it.
                    _ = introNode.InnerText;
                }
            }
        }
    }

    /// <summary>
    /// Returns the value of the node's <c>href</c> attribute, or null when the node itself or
    /// the attribute is missing.
    /// </summary>
    private static string GetHref(HtmlNode node)
    {
        if (node == null)
        {
            return null;
        }

        HtmlAttribute href = node.Attributes["href"];
        return href == null ? null : href.Value;
    }
}
}