using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;

namespace New_Spider
{
    /// <summary>
    /// Scrapes the institution listing on gaokao.chsi.com.cn with HtmlAgilityPack and
    /// collects each institution's name and description text.
    /// </summary>
    public class HtmlAgHelper
    {
        /// <summary>Format for turning an href taken from a page into a full URL.</summary>
        private const string SiteUrlFormat = "https://gaokao.chsi.com.cn/{0}";

        /// <summary>Format for a listing page; {0} is the "start" offset of the page.</summary>
        private const string ListPageUrlFormat = "https://gaokao.chsi.com.cn/sch/search--ss-on,option-qg,searchType-1,start-{0}.dhtml";

        /// <summary>Selects the per-institution rows on a listing page.</summary>
        private const string SchoolRowsXPath = "//*[@id=\"app-yxk-sch-list\"]/div[1]/div[*]";

        /// <summary>Selects the institution link, relative to one row matched by <see cref="SchoolRowsXPath"/>.</summary>
        private const string SchoolLinkXPath = "./div[1]/div/a";

        /// <summary>Selects, on an institution page, the link(s) that are followed to reach the description page.</summary>
        private const string DetailLinkXPath = "/html/body/div[1]/div[3]/div/a[2]";

        /// <summary>Selects the node(s) whose inner text is stored as the institution description.</summary>
        private const string DescriptionXPath = "/html/body/div[1]/div[4]/div[3]";

        /// <summary>
        /// Crawls the listing pages, resolves every listed institution's description and
        /// passes the results of each page, serialized as JSON, to <c>ITextGen.GenJson</c>
        /// together with the page offset.
        /// </summary>
        public void HtmlCreatePageData()
        {
            HtmlWeb webClient = new HtmlWeb();
            ITextGen textgen = new ITextGen();

            // The listing is paged through the "start" offset in steps of 20. The upper
            // bound of 1 limits the run to the first page (start-0); a commented-out
            // variant of this loop in the original used an upper bound of 2800.
            for (int k = 0; k <= 1; k += 20)
            {
                var jsonobjs = new List<JsonObj>();
                HtmlDocument doc = webClient.Load(string.Format(ListPageUrlFormat, k));

                List<HtmlNode> schoolLinks = FindSchoolLinks(doc);
                if (schoolLinks.Count == 0)
                {
                    Console.Error.WriteLine(string.Format("No institution rows found on listing page start-{0}.", k));
                }

                foreach (HtmlNode item in schoolLinks)
                {
                    // Pause between institutions to throttle requests to the site.
                    Thread.Sleep(500);
                    Console.WriteLine(string.Format("item: a:https://gaokao.chsi.com.cn/{0};name:{1}", GetHref(item), item.InnerText));
                    HtmlUniversityAgResolve(doc, webClient, item, item.InnerText.Trim(), jsonobjs);
                }

                Thread.Sleep(1000);
                textgen.GenJson(Newtonsoft.Json.JsonConvert.SerializeObject(jsonobjs), k.ToString());
            }
        }

        /// <summary>
        /// One scraped institution record. Property names are lower-case because they are
        /// emitted as-is by the JSON serializer.
        /// </summary>
        public class JsonObj
        {
            public string id { get; set; }
            public string name { get; set; }
            public string description { get; set; }
        }

        /// <summary>
        /// Parses an institution's information page: loads the page referenced by
        /// <paramref name="htmlNode"/> and resolves the description behind each detail link on it.
        /// </summary>
        /// <param name="htmldoc">Not read; kept so existing callers continue to compile.</param>
        /// <param name="htmlWeb">Web client used to load pages.</param>
        /// <param name="htmlNode">Anchor node whose <c>href</c> points at the institution page.</param>
        /// <param name="name">Institution name stored on every record that is produced.</param>
        /// <param name="jsons">List that receives the produced records.</param>
        public void HtmlUniversityAgResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, HtmlNode htmlNode, string name, List<JsonObj> jsons)
        {
            string schoolHref = GetHref(htmlNode);
            if (schoolHref == null)
            {
                return;
            }

            HtmlDocument schoolPage = htmlWeb.Load(string.Format(SiteUrlFormat, schoolHref));

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection detailLinks = schoolPage.DocumentNode.SelectNodes(DetailLinkXPath);
            if (detailLinks == null)
            {
                return;
            }

            foreach (HtmlNode detailLink in detailLinks)
            {
                Thread.Sleep(500);

                string detailHref = GetHref(detailLink);
                if (detailHref == null)
                {
                    continue;
                }

                try
                {
                    HtmlUniversityDescriptionResolve(schoolPage, htmlWeb, detailHref, name, jsons);
                }
                catch (Exception e)
                {
                    // Best effort: one failing institution must not abort the crawl,
                    // but the failure is reported instead of being silently dropped.
                    Console.Error.WriteLine(string.Format("Failed to resolve description for '{0}' ({1}): {2}", name, detailHref, e.Message));
                }
            }
        }

        /// <summary>
        /// Parses the institution description section: loads the page at <paramref name="url"/>
        /// and appends one record per matched description node to <paramref name="jsons"/>.
        /// </summary>
        /// <param name="htmldoc">Not read; kept so existing callers continue to compile.</param>
        /// <param name="htmlWeb">Web client used to load the page.</param>
        /// <param name="url">Href of the description page, appended to the site root.</param>
        /// <param name="name">Institution name stored on each record.</param>
        /// <param name="jsons">List that receives the produced records.</param>
        public void HtmlUniversityDescriptionResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, string url, string name, List<JsonObj> jsons)
        {
            // Institution page.
            HtmlDocument descriptionPage = htmlWeb.Load(string.Format(SiteUrlFormat, url));

            HtmlNodeCollection descriptionNodes = descriptionPage.DocumentNode.SelectNodes(DescriptionXPath);
            if (descriptionNodes == null)
            {
                return;
            }

            foreach (HtmlNode descriptionNode in descriptionNodes)
            {
                jsons.Add(new JsonObj()
                {
                    // Institution introduction text.
                    description = descriptionNode.InnerText,
                    name = name,
                    // NOTE(review): the pattern puts the hour (HH) before the day (dd);
                    // this looks like a typo for "yyyyMMddHHmmssfff" but is kept as-is
                    // because changing it alters the ids that are emitted - confirm.
                    id = DateTime.Now.ToString("yyyyMMHHddmmssfff"),
                });
            }
        }

        /// <summary>
        /// Walks the first listing page, each listed institution page and each description
        /// page, printing one line per institution. Nothing is stored or written out.
        /// </summary>
        public void HtmlAg()
        {
            HtmlWeb webClient = new HtmlWeb();
            HtmlDocument listPage = webClient.Load(string.Format(ListPageUrlFormat, 0));

            foreach (HtmlNode schoolLink in FindSchoolLinks(listPage))
            {
                string schoolHref = GetHref(schoolLink);
                Console.WriteLine(string.Format("item: a:https://gaokao.chsi.com.cn/{0};name:{1}", schoolHref, schoolLink.InnerText));

                HtmlDocument schoolPage = webClient.Load(string.Format(SiteUrlFormat, schoolHref));
                HtmlNodeCollection detailLinks = schoolPage.DocumentNode.SelectNodes(DetailLinkXPath);
                if (detailLinks == null)
                {
                    continue;
                }

                foreach (HtmlNode detailLink in detailLinks)
                {
                    string detailHref = GetHref(detailLink);
                    if (detailHref == null)
                    {
                        continue;
                    }

                    // Institution page.
                    HtmlDocument descriptionPage = webClient.Load(string.Format(SiteUrlFormat, detailHref));
                    HtmlNodeCollection descriptionNodes = descriptionPage.DocumentNode.SelectNodes(DescriptionXPath);
                    if (descriptionNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode descriptionNode in descriptionNodes)
                    {
                        // Institution introduction text; read but intentionally not kept.
                        var contexts = descriptionNode.InnerText;
                    }
                }
            }
        }

        /// <summary>
        /// Returns the institution anchor of every row on a listing page. Rows without an
        /// anchor, or whose anchor has no <c>href</c>, are skipped; the result is empty
        /// (never null) when the page has no matching rows.
        /// </summary>
        private static List<HtmlNode> FindSchoolLinks(HtmlDocument listPage)
        {
            var links = new List<HtmlNode>();

            HtmlNodeCollection rows = listPage.DocumentNode.SelectNodes(SchoolRowsXPath);
            if (rows == null)
            {
                return links;
            }

            foreach (HtmlNode row in rows)
            {
                // Query relative to the row itself rather than re-addressing it by a
                // running position from the document root, which can drift out of step
                // with the row set when a sibling div is not part of that set.
                HtmlNode link = row.SelectSingleNode(SchoolLinkXPath);
                if (link != null && GetHref(link) != null)
                {
                    links.Add(link);
                }
            }

            return links;
        }

        /// <summary>
        /// Returns the value of the node's <c>href</c> attribute, or null when the node is
        /// null or has no such attribute.
        /// </summary>
        private static string GetHref(HtmlNode node)
        {
            if (node == null)
            {
                return null;
            }

            HtmlAttribute href = node.Attributes["href"];
            return href == null ? null : href.Value;
        }
    }
}