diff --git a/New_Spider/Program.cs b/New_Spider/Program.cs index 1598caa..b8f548b 100644 --- a/New_Spider/Program.cs +++ b/New_Spider/Program.cs @@ -23,7 +23,13 @@ using System.Text.RegularExpressions; // secret_key = "AeHT1p4GzDYZuaBLZYOBa0r2Npp22uWv" //}, "https://api.xiaoe-tech.com/"); //Console.WriteLine(result.ToJson()); -Console.Read(); + // See https://aka.ms/new-console-template for more information -Console.WriteLine("Hello, World!"); + + +XueHtmlAgHelper xueHtmlAg = new XueHtmlAgHelper(); +//xueHtmlAg.GetDataRecruitDetail(); +Console.WriteLine("success!"); +Console.Read(); + diff --git a/New_Spider/xuexingaokao/XueHtmlAgHelper.cs b/New_Spider/xuexingaokao/XueHtmlAgHelper.cs new file mode 100644 index 0000000..2bc8ddf --- /dev/null +++ b/New_Spider/xuexingaokao/XueHtmlAgHelper.cs @@ -0,0 +1,537 @@ +using Aliyun.OSS; +using HtmlAgilityPack; +using NPOI.Util.ArrayExtensions; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Net.Http; +using System.Text; +using System.Text.Json; +using System.Threading.Tasks; + +namespace New_Spider +{ + public class XueHtmlAgHelper + { + private string bucketName = "static-data-ycymedu"; // + private string filePrefix = "college-data/page-"; + // 初始化 OSS 客户端 + private OssClient ossClient = new OssClient("https://oss-cn-shanghai.aliyuncs.com", "LTAI5tKs3TXSbt7E4WMDcxwR", "EvC8MjRaQC1kHubgU4MtecZnofOb0v"); + + /// + /// 阳光高考完数据解析 + /// + public async Task HtmlCreatePageData() + { + // API 配置 + string baseUrl = "http://api.xuexingaokao.com/api/college/base_college/index"; + int totalPages = 287; + string filePrefix = "college-data/page-"; + using var httpClient = new HttpClient(); + for (int page = 1; page <= totalPages; page++) + { + try + { + // 动态构建 API URL + var url = $"{baseUrl}?area=&type=&nature=&grade=&is_bz=&cname=&page={page}&perpage=10×tamp={DateTimeOffset.Now.ToUnixTimeSeconds()}"; + Console.WriteLine($"Fetching page {page}..."); + + // 获取 JSON 数据(同步请求) + var response = httpClient.GetAsync(url).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = response.Content.ReadAsStringAsync().Result; + + // 构建文件名和路径 + string objectName = $"{filePrefix}{page}.json"; + + // 上传到 OSS(同步上传) + using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData)); + ossClient.PutObject(bucketName, objectName, stream); + + Console.WriteLine($"Uploaded page {page} to OSS as {objectName}"); + } + else + { + Console.WriteLine($"Failed to fetch page {page}, Status Code: {response.StatusCode}"); + } + } + catch (Exception ex) + { + Console.WriteLine($"Error fetching or uploading page {page}: {ex.Message}"); + } + } + } + + + public async Task GetDataDetail() + { + int totalPages = 287; + string baseUrl = "https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/"; + string detailUrl = "http://api.xuexingaokao.com/api/college/base_college/collegeDetail?college_id="; + string facturl = "http://api.xuexingaokao.com/api/college/base_college/faculty?college_id=685×tamp=1733820969"; + string subjectIntroduceurl = "http://api.xuexingaokao.com/api/college/base_college/subjectIntroduce?college_id=685×tamp=1733821065"; + string recruitListUrl = "http://api.xuexingaokao.com/api/college/base_college/recruitList?college_id=685×tamp=1733821136"; + using var httpClient = new HttpClient(); + + for (int page = 1; page <= totalPages; page++) + { // 动态构建 API URL + var url = $"{baseUrl}{filePrefix}{page}.json"; + Console.WriteLine($"Fetching page {page}..."); + var response = httpClient.GetAsync(url).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = JsonSerializer.Deserialize(response.Content.ReadAsStringAsync().Result); + if (jsonData?.code == 1) + { + string filePrefix2 = "college-detail/"; + jsonData.data.data.ToList().ForEach(a => + { + // 动态构建 API URL + var detailgourl = $"{detailUrl}{a.id}×tamp={DateTimeOffset.Now.ToUnixTimeSeconds()}"; + Console.WriteLine($"detailgourl {a.id}..."); + // 获取 JSON 数据(同步请求) + var response = httpClient.GetAsync(detailgourl).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = response.Content.ReadAsStringAsync().Result; + // 构建文件名和路径 + string objectName = $"{filePrefix2}{a.id}.json"; + // 上传到 OSS(同步上传) + using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData)); + ossClient.PutObject(bucketName, objectName, stream); + Console.WriteLine($"Uploaded page {page} to OSS as {objectName}"); + } + else + { + Console.WriteLine($"Failed to fetch page {page}, Status Code: {response.StatusCode}"); + } + }); + } + } + } + } + + + /// + /// 缺失详情页 + /// + public void GetDataRecruitList() + { + int totalPages = 287; + string baseUrl = "https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/"; + string recruitListUrl = "http://api.xuexingaokao.com/api/college/base_college/recruitList?college_id="; + using var httpClient = new HttpClient(); + for (int page = 1; page <= totalPages; page++) + { // 动态构建 API URL + var url = $"{baseUrl}{filePrefix}{page}.json"; + Console.WriteLine($"recruitListUrl page {page}..."); + var response = httpClient.GetAsync(url).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = JsonSerializer.Deserialize(response.Content.ReadAsStringAsync().Result); + if (jsonData?.code == 1) + { + + string filePrefix2 = "college-recruitlist/"; + jsonData.data.data.ToList().ForEach(a => + { + Thread.Sleep(200); + // 动态构建 API URL + var gourl = $"{recruitListUrl}{a.id}×tamp={DateTimeOffset.Now.ToUnixTimeSeconds()}"; + Console.WriteLine($"recruitListUrl {a.id}..."); + // 获取 JSON 数据(同步请求) + var response = httpClient.GetAsync(gourl).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = response.Content.ReadAsStringAsync().Result; + // 构建文件名和路径 + string objectName = $"{filePrefix2}{a.id}.json"; + // 上传到 OSS(同步上传) + using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData)); + ossClient.PutObject(bucketName, objectName, stream); + Console.WriteLine($"Uploaded page {page} to OSS as {objectName}"); + } + else + { + Console.WriteLine($"Failed to fetch page {page}, Status Code: {response.StatusCode}"); + } + }); + } + } + } + } + + + + + public void GetDataSubjectIntroduce() + { + int totalPages = 287; + string baseUrl = "https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/"; + string subjectIntroduceurl = "http://api.xuexingaokao.com/api/college/base_college/subjectIntroduce?college_id="; + using var httpClient = new HttpClient(); + for (int page = 1; page <= totalPages; page++) + { // 动态构建 API URL + var url = $"{baseUrl}{filePrefix}{page}.json"; + Console.WriteLine($"subjectIntroduceurl page {page}..."); + var response = httpClient.GetAsync(url).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = JsonSerializer.Deserialize(response.Content.ReadAsStringAsync().Result); + if (jsonData?.code == 1) + { + + string filePrefix2 = "college-subjectIntroduce/"; + jsonData.data.data.ToList().ForEach(a => + { + Thread.Sleep(200); + // 动态构建 API URL + var gourl = $"{subjectIntroduceurl}{a.id}×tamp={DateTimeOffset.Now.ToUnixTimeSeconds()}"; + Console.WriteLine($"subjectIntroduceurl {a.id}..."); + // 获取 JSON 数据(同步请求) + var response = httpClient.GetAsync(gourl).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = response.Content.ReadAsStringAsync().Result; + // 构建文件名和路径 + string objectName = $"{filePrefix2}{a.id}.json"; + // 上传到 OSS(同步上传) + using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData)); + ossClient.PutObject(bucketName, objectName, stream); + Console.WriteLine($"Uploaded page {page} to OSS as {objectName}"); + } + else + { + Console.WriteLine($"Failed to fetch page {page}, Status Code: {response.StatusCode}"); + } + }); + } + } + } + } + + + /// + /// + /// + /// + public void GetDataFaculty() + { + int totalPages = 287; + string baseUrl = "https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/"; + string facturl = "http://api.xuexingaokao.com/api/college/base_college/faculty?college_id="; + using var httpClient = new HttpClient(); + for (int page = 1; page <= totalPages; page++) + { // 动态构建 API URL + var url = $"{baseUrl}{filePrefix}{page}.json"; + Console.WriteLine($"Fetching page {page}..."); + var response = httpClient.GetAsync(url).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = JsonSerializer.Deserialize(response.Content.ReadAsStringAsync().Result); + if (jsonData?.code == 1) + { + + string filePrefix2 = "college-faculty/"; + jsonData.data.data.ToList().ForEach(a => + { + Thread.Sleep(300); + // 动态构建 API URL + var gourl = $"{facturl}{a.id}×tamp={DateTimeOffset.Now.ToUnixTimeSeconds()}"; + Console.WriteLine($"facultyurl {a.id}..."); + // 获取 JSON 数据(同步请求) + var response = httpClient.GetAsync(gourl).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = response.Content.ReadAsStringAsync().Result; + // 构建文件名和路径 + string objectName = $"{filePrefix2}{a.id}.json"; + // 上传到 OSS(同步上传) + using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData)); + ossClient.PutObject(bucketName, objectName, stream); + Console.WriteLine($"Uploaded page {page} to OSS as {objectName}"); + } + else + { + Console.WriteLine($"Failed to fetch page {page}, Status Code: {response.StatusCode}"); + } + }); + } + } + } + } + + + + + /// + /// 招生简介详情 + /// + public void GetDataRecruitDetail() + { + + int totalPages = 287; + string baseUrl = "https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/"; + string recruitUrl = "http://api.xuexingaokao.com/api/college/base_college/recruitDetail?recruit_id="; + using var httpClient = new HttpClient(); + string filePrefix2 = "college-recruitlist/"; + string filePrefix3 = "college-recruitdetail/"; + for (int page = 1; page <= totalPages; page++) + { // 动态构建 API URL + var url = $"{baseUrl}{filePrefix}{page}.json"; + Console.WriteLine($"recruitListUrl page {page}..."); + var response = httpClient.GetAsync(url).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = JsonSerializer.Deserialize(response.Content.ReadAsStringAsync().Result); + if (jsonData?.code == 1) + { + jsonData.data.data.ToList().ForEach(a => + { + string objectUrl = $"{baseUrl}{filePrefix2}{a.id}.json"; + Console.WriteLine($"down page {objectUrl}"); + var responseitem = httpClient.GetAsync(objectUrl).Result; + if (responseitem.IsSuccessStatusCode) + { + try + { + var items = JsonSerializer.Deserialize(responseitem.Content.ReadAsStringAsync().Result); + if (items != null) + { + if (items.code == 1) + { + items.data.recruit.ToList().ForEach(e => + { + Thread.Sleep(150); + + try + { + var itemGoUrl = $"{recruitUrl}{e.id}×tamp={DateTimeOffset.Now.ToUnixTimeSeconds()}"; + var response = httpClient.GetAsync(itemGoUrl).Result; + if (response.IsSuccessStatusCode) + { + var jsonData = response.Content.ReadAsStringAsync().Result; + // 构建文件名和路径 + string objectName = $"{filePrefix3}{e.id}.json"; + // 上传到 OSS(同步上传) + using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData)); + ossClient.PutObject(bucketName, objectName, stream); + Console.WriteLine($"Uploaded page {page} to OSS as {objectName}"); + } + else + { + Console.WriteLine($"Failed to fetch page {page}, Status Code: {response.StatusCode}"); + } + } + catch (Exception ex) + { + Console.WriteLine(ex.Message); + } + }); + + + } + } + } + catch (Exception exx) + { + Console.WriteLine(exx.Message); + Console.WriteLine($"{objectUrl}解析失败"); + } + + } + }); + } + } + } + + } + + + + } + +} + + + +public class universitylistobject +{ + public int code { get; set; } + public string msg { get; set; } + public Data data { get; set; } +} + +public class Data +{ + public int total { get; set; } + public string has_next { get; set; } + public string page { get; set; } + public string perpage { get; set; } + public Params _params { get; set; } + public Paramslist paramsList { get; set; } + public Datum[] data { get; set; } +} + +public class Params +{ + public string area { get; set; } + public string type { get; set; } + public string nature { get; set; } + public string grade { get; set; } + public string is_bz { get; set; } + public string cname { get; set; } + public string page { get; set; } + public string perpage { get; set; } + public string timestamp { get; set; } +} + +public class Paramslist +{ + public Arealist[] arealist { get; set; } + public Typelist[] typelist { get; set; } + public Naturelist[] naturelist { get; set; } + public Xingzhilist[] xingzhilist { get; set; } + public Classlist[] classlist { get; set; } +} + +public class Arealist +{ + public int id { get; set; } + public string area { get; set; } + public string province { get; set; } + public string city { get; set; } + public int create_time { get; set; } + public string text { get; set; } + public Child[] children { get; set; } +} + +public class Child +{ + public int id { get; set; } + public string area { get; set; } + public string province { get; set; } + public string city { get; set; } + public int create_time { get; set; } + public string text { get; set; } +} + +public class Typelist +{ + public int id { get; set; } + public string type_name { get; set; } + public int create_time { get; set; } + public int list_order { get; set; } +} + +public class Naturelist +{ + public int id { get; set; } + public string nature_name { get; set; } + public int create_time { get; set; } + public int list_order { get; set; } + public int staue { get; set; } +} + +public class Xingzhilist +{ + public int id { get; set; } + public string nature_name { get; set; } + public int create_time { get; set; } + public int list_order { get; set; } + public int staue { get; set; } +} + +public class Classlist +{ + public int id { get; set; } + public string nature_name { get; set; } + public int create_time { get; set; } + public int list_order { get; set; } + public int staue { get; set; } +} + +public class Datum +{ + public int id { get; set; } + public string college_name { get; set; } + public string college_name_display { get; set; } + public string rk_ranking_2024 { get; set; } + public int rk_ranking_list { get; set; } + public string college_class { get; set; } + public string college_nature { get; set; } + public Detail detail { get; set; } + public string rk_ranking { get; set; } +} + +public class Detail +{ + public int id { get; set; } + public string college_name { get; set; } + public string college_logo { get; set; } + public object ex_rate { get; set; } + public string college_class { get; set; } + public string college_create_time { get; set; } + public string master_num { get; set; } + public string doctor_num { get; set; } + public string college_nature { get; set; } + public string college_type { get; set; } + public string college_area { get; set; } + public string college_competent { get; set; } + public string[] college_grade { get; set; } + public object wsl_ranking { get; set; } + public string rk_ranking { get; set; } + public string enrollment_rate_in { get; set; } + public string enrollment_rate_out { get; set; } + public string job_report { get; set; } +} + + + + + + +public class RecruitRootobject +{ + public int code { get; set; } + public string msg { get; set; } + public RecruitData data { get; set; } +} + +public class RecruitData +{ + public Tag tag { get; set; } + public List recruit { get; set; } +} + +public class Tag +{ + public string id { get; set; } + public string college_name { get; set; } + public string college_logo { get; set; } + public string ex_rate { get; set; } + public string college_class { get; set; } + public string college_create_time { get; set; } + public string master_num { get; set; } + public string doctor_num { get; set; } + public string college_nature { get; set; } + public string college_type { get; set; } + public string college_area { get; set; } + public string college_competent { get; set; } + public string[] college_grade { get; set; } + public int wsl_ranking { get; set; } + public string rk_ranking { get; set; } + public string enrollment_rate_in { get; set; } + public string enrollment_rate_out { get; set; } + public string job_report { get; set; } +} + +public class Recruit +{ + public int id { get; set; } + public int base_college_id { get; set; } + public string year { get; set; } + public string college_name { get; set; } + public string title { get; set; } +} diff --git a/New_Spider/xuexingaokao/XueHtmlDto.cs b/New_Spider/xuexingaokao/XueHtmlDto.cs new file mode 100644 index 0000000..f40f905 --- /dev/null +++ b/New_Spider/xuexingaokao/XueHtmlDto.cs @@ -0,0 +1,12 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace New_Spider.xuexingaokao +{ + internal class XueHtmlDto + { + } +} diff --git a/New_Spider/HtmlAgHelper.cs b/New_Spider/阳光高考/HtmlAgHelper.cs similarity index 97% rename from New_Spider/HtmlAgHelper.cs rename to New_Spider/阳光高考/HtmlAgHelper.cs index 1eb2c00..82af96d 100644 --- a/New_Spider/HtmlAgHelper.cs +++ b/New_Spider/阳光高考/HtmlAgHelper.cs @@ -7,7 +7,7 @@ using System.Text; using System.Text.Json; using System.Threading.Tasks; -namespace New_Spider +namespace New_Spider.阳光高考 { public class HtmlAgHelper { @@ -37,7 +37,7 @@ namespace New_Spider Console.WriteLine(string.Format("item: a:https://gaokao.chsi.com.cn/{0};name:{1}", item.Attributes["href"].Value, item.InnerText)); - HtmlUniversityAgResolve(doc, webClient, item, item.InnerText.Trim(),jsonobjs); + HtmlUniversityAgResolve(doc, webClient, item, item.InnerText.Trim(), jsonobjs); } // Console.WriteLine(k); Thread.Sleep(1000); @@ -62,7 +62,7 @@ namespace New_Spider /// /// /// - public void HtmlUniversityAgResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, HtmlNode htmlNode,string name, List jsons) + public void HtmlUniversityAgResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, HtmlNode htmlNode, string name, List jsons) { htmldoc = htmlWeb.Load(string.Format("https://gaokao.chsi.com.cn/{0}", htmlNode.Attributes["href"].Value)); diff --git a/New_Spider/HtmlAgMajorHelper.cs b/New_Spider/阳光高考/HtmlAgMajorHelper.cs similarity index 91% rename from New_Spider/HtmlAgMajorHelper.cs rename to New_Spider/阳光高考/HtmlAgMajorHelper.cs index d30e955..c285dab 100644 --- a/New_Spider/HtmlAgMajorHelper.cs +++ b/New_Spider/阳光高考/HtmlAgMajorHelper.cs @@ -9,7 +9,7 @@ using System.Text; using System.Text.RegularExpressions; using System.Threading.Tasks; -namespace New_Spider +namespace New_Spider.阳光高考 { public class HtmlAgMajorHelper { @@ -47,8 +47,8 @@ namespace New_Spider { var basejson = UniversityTypeRelsove.GetFileJson(AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "Files/1070.json"); - // var jsontext2 = Regex.Unescape(jsontext.Remove(jsontext.Length - 1, 1).Remove(0, 1)); - + // var jsontext2 = Regex.Unescape(jsontext.Remove(jsontext.Length - 1, 1).Remove(0, 1)); + var jsons = JsonConvert.DeserializeObject(basejson); string pathFile = AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "Files/Type/"; //判断文件夹是否存在 @@ -61,18 +61,18 @@ namespace New_Spider var list = jsons.msg.ToList(); list.ForEach(c => { - - wc.DownloadFile(string.Format("https://gaokao.chsi.com.cn/zyk/zybk/xkCategory/{0}?_t=1694748927326", c.key), string.Format(pathFile + "{0}.json", c.key)); + + wc.DownloadFile(string.Format("https://gaokao.chsi.com.cn/zyk/zybk/xkCategory/{0}?_t=1694748927326", c.key), string.Format(pathFile + "{0}.json", c.key)); }); - } + } /// /// 下载三级分类 /// public void DownloadChildTypeListFile() { - + string[] files = Directory.GetFiles(AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "Files/Type/"); foreach (string itemfile in files) { diff --git a/New_Spider/HtmlAgNewsHelper.cs b/New_Spider/阳光高考/HtmlAgNewsHelper.cs similarity index 88% rename from New_Spider/HtmlAgNewsHelper.cs rename to New_Spider/阳光高考/HtmlAgNewsHelper.cs index ee07fa1..df4eafd 100644 --- a/New_Spider/HtmlAgNewsHelper.cs +++ b/New_Spider/阳光高考/HtmlAgNewsHelper.cs @@ -8,7 +8,7 @@ using System.Net; using System.Text; using System.Threading.Tasks; -namespace New_Spider +namespace New_Spider.阳光高考 { public class HtmlAgNewsHelper { @@ -54,8 +54,8 @@ namespace New_Spider options.AddArgument("--headless"); options.AddArgument("--no-sandbox"); options.AddArgument("--disable-dev-shm-usage"); - String driverExecutableFileName = "chromedriver"; - String driverPath = "D:\\一草一木教育\\newgaokao\\New_Spider\\bin\\Debug\\net6.0\\"; + string driverExecutableFileName = "chromedriver"; + string driverPath = "D:\\一草一木教育\\newgaokao\\New_Spider\\bin\\Debug\\net6.0\\"; ChromeDriverService service = ChromeDriverService.CreateDefaultService(driverPath, driverExecutableFileName); var driver = new ChromeDriver(service, options); @@ -66,7 +66,7 @@ namespace New_Spider Thread.Sleep(1000); // 这里使用简单的等待,更好的做法是使用 WebDriverWait // 获取动态内容 - var content = driver.FindElement(By.XPath("//*[@id=\"app\"]/div[2]/div[2]")); + var content = driver.FindElement(By.XPath("//*[@id=\"app\"]/div[2]/div[2]")); // 也可以进一步获取子节点的子节点 @@ -75,8 +75,8 @@ namespace New_Spider { var grandChild = grandChildElement.FindElement(By.TagName("span")); var grandChildText = grandChild.Text; - var titletext= grandChildElement.FindElement(By.TagName("a")).Text; - var aurl= grandChildElement.FindElement(By.TagName("a")).GetAttribute("href"); + var titletext = grandChildElement.FindElement(By.TagName("a")).Text; + var aurl = grandChildElement.FindElement(By.TagName("a")).GetAttribute("href"); Console.WriteLine($"Grandchild Element Text: {grandChildText}"); }