using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using Aliyun.OSS;
using HtmlAgilityPack;
using NPOI.Util.ArrayExtensions;

namespace New_Spider
{
    /// <summary>
    /// Crawls college data from the xuexingaokao API and mirrors the raw JSON
    /// responses into an Aliyun OSS bucket, one object per page / college / recruit entry.
    /// </summary>
    /// <remarks>
    /// OSS credentials are read from the <c>OSS_ACCESS_KEY_ID</c> and
    /// <c>OSS_ACCESS_KEY_SECRET</c> environment variables; they must be set before
    /// an instance is constructed. Credentials must never be committed to source.
    /// </remarks>
    public class XueHtmlAgHelper
    {
        /// <summary>Number of listing pages to walk (10 colleges per page).</summary>
        private const int TotalPages = 287;

        private const string OssEndpoint = "https://oss-cn-shanghai.aliyuncs.com";

        /// <summary>Public URL of the bucket; used to read back previously mirrored objects.</summary>
        private const string MirrorBaseUrl = "https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/";

        private const string ApiBaseUrl = "http://api.xuexingaokao.com/api/college/base_college/";

        /// <summary>Object-key prefix of the mirrored listing pages.</summary>
        private const string PagePrefix = "college-data/page-";

        /// <summary>Object-key prefix of the mirrored per-college recruit lists.</summary>
        private const string RecruitListPrefix = "college-recruitlist/";

        /// <summary>Object-key prefix of the mirrored recruit detail documents.</summary>
        private const string RecruitDetailPrefix = "college-recruitdetail/";

        // One shared client for the whole process instead of one per method call.
        private static readonly HttpClient Http = new HttpClient();

        private readonly string bucketName = "static-data-ycymedu";

        // OSS client; credentials come from the environment, not from source.
        private readonly OssClient ossClient = CreateOssClient();

        /// <summary>
        /// Downloads every college listing page from the API and uploads each page
        /// to OSS as <c>college-data/page-{page}.json</c>.
        /// A failure on one page is logged and does not stop the remaining pages.
        /// </summary>
        public async Task HtmlCreatePageData()
        {
            string listingUrl = ApiBaseUrl + "index";

            for (int page = 1; page <= TotalPages; page++)
            {
                try
                {
                    // Build the API URL for this page.
                    string url = AppendTimestamp(
                        $"{listingUrl}?area=&type=&nature=&grade=&is_bz=&cname=&page={page}&perpage=10");
                    Console.WriteLine($"Fetching page {page}...");

                    await MirrorAsync(url, $"{PagePrefix}{page}.json", page).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    Console.WriteLine($"Error fetching or uploading page {page}: {ex.Message}");
                }
            }
        }

        /// <summary>
        /// For every college found in the mirrored listing pages, downloads the
        /// college detail document and uploads it as <c>college-detail/{id}.json</c>.
        /// </summary>
        public async Task GetDataDetail()
        {
            await MirrorPerCollegeAsync(
                ApiBaseUrl + "collegeDetail?college_id=",
                "college-detail/",
                "Fetching",
                "detailgourl",
                0).ConfigureAwait(false);
        }

        /// <summary>
        /// For every college found in the mirrored listing pages, downloads the
        /// recruit list and uploads it as <c>college-recruitlist/{id}.json</c>.
        /// Waits 200 ms before each request.
        /// </summary>
        public void GetDataRecruitList()
        {
            MirrorPerCollegeAsync(
                ApiBaseUrl + "recruitList?college_id=",
                RecruitListPrefix,
                "recruitListUrl",
                "recruitListUrl",
                200).GetAwaiter().GetResult();
        }

        /// <summary>
        /// For every college found in the mirrored listing pages, downloads the
        /// subject introduction and uploads it as <c>college-subjectIntroduce/{id}.json</c>.
        /// Waits 200 ms before each request.
        /// </summary>
        public void GetDataSubjectIntroduce()
        {
            MirrorPerCollegeAsync(
                ApiBaseUrl + "subjectIntroduce?college_id=",
                "college-subjectIntroduce/",
                "subjectIntroduceurl",
                "subjectIntroduceurl",
                200).GetAwaiter().GetResult();
        }

        /// <summary>
        /// For every college found in the mirrored listing pages, downloads the
        /// faculty document and uploads it as <c>college-faculty/{id}.json</c>.
        /// Waits 300 ms before each request.
        /// </summary>
        public void GetDataFaculty()
        {
            MirrorPerCollegeAsync(
                ApiBaseUrl + "faculty?college_id=",
                "college-faculty/",
                "Fetching",
                "facultyurl",
                300).GetAwaiter().GetResult();
        }

        /// <summary>
        /// Enrollment brochure details: for every college, reads its mirrored recruit
        /// list from OSS, then downloads each recruit entry's detail document and uploads
        /// it as <c>college-recruitdetail/{recruitId}.json</c>.
        /// Waits 150 ms before each detail request.
        /// </summary>
        public void GetDataRecruitDetail()
        {
            MirrorRecruitDetailsAsync().GetAwaiter().GetResult();
        }

        /// <summary>
        /// Builds the OSS client from the <c>OSS_ACCESS_KEY_ID</c> and
        /// <c>OSS_ACCESS_KEY_SECRET</c> environment variables.
        /// </summary>
        /// <exception cref="InvalidOperationException">Either variable is missing or blank.</exception>
        private static OssClient CreateOssClient()
        {
            string accessKeyId = Environment.GetEnvironmentVariable("OSS_ACCESS_KEY_ID");
            string accessKeySecret = Environment.GetEnvironmentVariable("OSS_ACCESS_KEY_SECRET");

            if (string.IsNullOrWhiteSpace(accessKeyId) || string.IsNullOrWhiteSpace(accessKeySecret))
            {
                throw new InvalidOperationException(
                    "Set the OSS_ACCESS_KEY_ID and OSS_ACCESS_KEY_SECRET environment variables before constructing XueHtmlAgHelper.");
            }

            return new OssClient(OssEndpoint, accessKeyId, accessKeySecret);
        }

        /// <summary>Appends the <c>timestamp</c> query parameter (Unix seconds) the API URLs carry.</summary>
        private static string AppendTimestamp(string url)
        {
            return $"{url}&timestamp={DateTimeOffset.UtcNow.ToUnixTimeSeconds()}";
        }

        /// <summary>
        /// Downloads <paramref name="sourceUrl"/> and, on a success status, uploads the
        /// body to OSS under <paramref name="objectName"/>. A non-success status is logged
        /// and skipped. <paramref name="page"/> is used only in the log messages.
        /// </summary>
        private async Task MirrorAsync(string sourceUrl, string objectName, int page)
        {
            // ConfigureAwait(false) throughout: the synchronous public wrappers block on
            // these tasks, which must therefore never need the caller's context to resume.
            using var response = await Http.GetAsync(sourceUrl).ConfigureAwait(false);
            if (!response.IsSuccessStatusCode)
            {
                Console.WriteLine($"Failed to fetch page {page}, Status Code: {response.StatusCode}");
                return;
            }

            string json = await response.Content.ReadAsStringAsync().ConfigureAwait(false);

            // The OSS upload itself is a synchronous SDK call.
            using var stream = new MemoryStream(Encoding.UTF8.GetBytes(json));
            ossClient.PutObject(bucketName, objectName, stream);
            Console.WriteLine($"Uploaded page {page} to OSS as {objectName}");
        }

        /// <summary>
        /// Reads one mirrored listing page back from OSS and returns its colleges.
        /// Returns an empty array when the page is unavailable, its <c>code</c> is not 1,
        /// or it carries no college array.
        /// </summary>
        private static async Task<Datum[]> LoadCollegePageAsync(int page)
        {
            string url = $"{MirrorBaseUrl}{PagePrefix}{page}.json";

            using var response = await Http.GetAsync(url).ConfigureAwait(false);
            if (!response.IsSuccessStatusCode)
            {
                return Array.Empty<Datum>();
            }

            string body = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
            var listing = JsonSerializer.Deserialize<universitylistobject>(body);
            if (listing == null || listing.code != 1)
            {
                return Array.Empty<Datum>();
            }

            return listing.data?.data ?? Array.Empty<Datum>();
        }

        /// <summary>
        /// Reads one mirrored recruit list from OSS and returns its entries.
        /// Returns an empty list when the object is unavailable or its <c>code</c> is not 1.
        /// </summary>
        /// <exception cref="JsonException">
        /// The body is not valid JSON, or reports <c>code</c> 1 without a recruit array.
        /// </exception>
        private static async Task<List<Recruit>> LoadRecruitsAsync(string objectUrl)
        {
            using var response = await Http.GetAsync(objectUrl).ConfigureAwait(false);
            if (!response.IsSuccessStatusCode)
            {
                return new List<Recruit>();
            }

            string body = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
            var root = JsonSerializer.Deserialize<RecruitRootobject>(body);
            if (root == null || root.code != 1)
            {
                return new List<Recruit>();
            }

            if (root.data?.recruit == null)
            {
                throw new JsonException("Response reports code 1 but has no data.recruit array.");
            }

            return root.data.recruit;
        }

        /// <summary>
        /// Shared crawl loop: walks every mirrored listing page and, for each college on it,
        /// mirrors <c>{apiUrlPrefix}{id}&amp;timestamp=…</c> to <c>{ossPrefix}{id}.json</c>.
        /// </summary>
        /// <param name="apiUrlPrefix">API URL up to and including <c>college_id=</c>.</param>
        /// <param name="ossPrefix">Object-key prefix for the uploaded documents.</param>
        /// <param name="pageLabel">Word that starts the per-page progress line.</param>
        /// <param name="itemLabel">Word that starts the per-college progress line.</param>
        /// <param name="delayMs">Pause before each per-college request; 0 disables it.</param>
        private async Task MirrorPerCollegeAsync(
            string apiUrlPrefix,
            string ossPrefix,
            string pageLabel,
            string itemLabel,
            int delayMs)
        {
            for (int page = 1; page <= TotalPages; page++)
            {
                Console.WriteLine($"{pageLabel} page {page}...");

                Datum[] colleges;
                try
                {
                    colleges = await LoadCollegePageAsync(page).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    // Best-effort crawl: one unreadable page must not abort the rest.
                    Console.WriteLine($"Error fetching or uploading page {page}: {ex.Message}");
                    continue;
                }

                foreach (Datum college in colleges)
                {
                    if (delayMs > 0)
                    {
                        await Task.Delay(delayMs).ConfigureAwait(false);
                    }

                    string url = AppendTimestamp($"{apiUrlPrefix}{college.id}");
                    string objectName = $"{ossPrefix}{college.id}.json";
                    Console.WriteLine($"{itemLabel} {college.id}...");

                    try
                    {
                        await MirrorAsync(url, objectName, page).ConfigureAwait(false);
                    }
                    catch (Exception ex)
                    {
                        // Best-effort crawl: log and move on to the next college.
                        Console.WriteLine($"Error fetching or uploading {objectName}: {ex.Message}");
                    }
                }
            }
        }

        /// <summary>Asynchronous implementation behind <see cref="GetDataRecruitDetail"/>.</summary>
        private async Task MirrorRecruitDetailsAsync()
        {
            string recruitUrl = ApiBaseUrl + "recruitDetail?recruit_id=";

            for (int page = 1; page <= TotalPages; page++)
            {
                Console.WriteLine($"recruitListUrl page {page}...");

                Datum[] colleges;
                try
                {
                    colleges = await LoadCollegePageAsync(page).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    Console.WriteLine($"Error fetching or uploading page {page}: {ex.Message}");
                    continue;
                }

                foreach (Datum college in colleges)
                {
                    string objectUrl = $"{MirrorBaseUrl}{RecruitListPrefix}{college.id}.json";
                    Console.WriteLine($"down page {objectUrl}");

                    List<Recruit> recruits;
                    try
                    {
                        recruits = await LoadRecruitsAsync(objectUrl).ConfigureAwait(false);
                    }
                    catch (Exception exx)
                    {
                        Console.WriteLine(exx.Message);
                        Console.WriteLine($"{objectUrl}解析失败");
                        continue;
                    }

                    foreach (Recruit recruit in recruits)
                    {
                        await Task.Delay(150).ConfigureAwait(false);

                        try
                        {
                            string detailUrl = AppendTimestamp($"{recruitUrl}{recruit.id}");
                            await MirrorAsync(detailUrl, $"{RecruitDetailPrefix}{recruit.id}.json", page)
                                .ConfigureAwait(false);
                        }
                        catch (Exception ex)
                        {
                            Console.WriteLine(ex.Message);
                        }
                    }
                }
            }
        }
    }
}

// ---------------------------------------------------------------------------
// JSON DTOs. Member names are lowercase / snake_case so they line up with the
// JSON keys under System.Text.Json's default (case-sensitive) name matching;
// do not rename them.
// ---------------------------------------------------------------------------

/// <summary>Envelope of a college listing page response.</summary>
public class universitylistobject
{
    public int code { get; set; }
    public string msg { get; set; }
    public Data data { get; set; }
}

/// <summary>Payload of a listing page: paging info, filter metadata and the colleges.</summary>
public class Data
{
    public int total { get; set; }
    public string has_next { get; set; }
    public string page { get; set; }
    public string perpage { get; set; }

    // NOTE(review): a JSON key named "params" would not bind to "_params" under the
    // default naming policy, so this is presumably never populated - confirm before relying on it.
    public Params _params { get; set; }
    public Paramslist paramsList { get; set; }
    public Datum[] data { get; set; }
}

/// <summary>Query parameters of a listing request.</summary>
public class Params
{
    public string area { get; set; }
    public string type { get; set; }
    public string nature { get; set; }
    public string grade { get; set; }
    public string is_bz { get; set; }
    public string cname { get; set; }
    public string page { get; set; }
    public string perpage { get; set; }
    public string timestamp { get; set; }
}

/// <summary>Filter option lists returned alongside a listing page.</summary>
public class Paramslist
{
    public Arealist[] arealist { get; set; }
    public Typelist[] typelist { get; set; }
    public Naturelist[] naturelist { get; set; }
    public Xingzhilist[] xingzhilist { get; set; }
    public Classlist[] classlist { get; set; }
}

/// <summary>Area filter option with nested child areas.</summary>
public class Arealist
{
    public int id { get; set; }
    public string area { get; set; }
    public string province { get; set; }
    public string city { get; set; }
    public int create_time { get; set; }
    public string text { get; set; }
    public Child[] children { get; set; }
}

/// <summary>Child entry of an <see cref="Arealist"/>.</summary>
public class Child
{
    public int id { get; set; }
    public string area { get; set; }
    public string province { get; set; }
    public string city { get; set; }
    public int create_time { get; set; }
    public string text { get; set; }
}

/// <summary>College type filter option.</summary>
public class Typelist
{
    public int id { get; set; }
    public string type_name { get; set; }
    public int create_time { get; set; }
    public int list_order { get; set; }
}

/// <summary>Entry of <see cref="Paramslist.naturelist"/>.</summary>
public class Naturelist
{
    public int id { get; set; }
    public string nature_name { get; set; }
    public int create_time { get; set; }
    public int list_order { get; set; }
    public int staue { get; set; }
}

/// <summary>Entry of <see cref="Paramslist.xingzhilist"/>.</summary>
public class Xingzhilist
{
    public int id { get; set; }
    public string nature_name { get; set; }
    public int create_time { get; set; }
    public int list_order { get; set; }
    public int staue { get; set; }
}

/// <summary>Entry of <see cref="Paramslist.classlist"/>.</summary>
public class Classlist
{
    public int id { get; set; }
    public string nature_name { get; set; }
    public int create_time { get; set; }
    public int list_order { get; set; }
    public int staue { get; set; }
}

/// <summary>One college row of a listing page; <see cref="id"/> keys all per-college requests.</summary>
public class Datum
{
    public int id { get; set; }
    public string college_name { get; set; }
    public string college_name_display { get; set; }
    public string rk_ranking_2024 { get; set; }
    public int rk_ranking_list { get; set; }
    public string college_class { get; set; }
    public string college_nature { get; set; }
    public Detail detail { get; set; }
    public string rk_ranking { get; set; }
}

/// <summary>Detail record embedded in a listing row.</summary>
public class Detail
{
    public int id { get; set; }
    public string college_name { get; set; }
    public string college_logo { get; set; }
    public object ex_rate { get; set; }
    public string college_class { get; set; }
    public string college_create_time { get; set; }
    public string master_num { get; set; }
    public string doctor_num { get; set; }
    public string college_nature { get; set; }
    public string college_type { get; set; }
    public string college_area { get; set; }
    public string college_competent { get; set; }
    public string[] college_grade { get; set; }
    public object wsl_ranking { get; set; }
    public string rk_ranking { get; set; }
    public string enrollment_rate_in { get; set; }
    public string enrollment_rate_out { get; set; }
    public string job_report { get; set; }
}

/// <summary>Envelope of a per-college recruit list response.</summary>
public class RecruitRootobject
{
    public int code { get; set; }
    public string msg { get; set; }
    public RecruitData data { get; set; }
}

/// <summary>Payload of a recruit list: the college tag and its recruit entries.</summary>
public class RecruitData
{
    public Tag tag { get; set; }
    public List<Recruit> recruit { get; set; }
}

/// <summary>College summary attached to a recruit list.</summary>
public class Tag
{
    public string id { get; set; }
    public string college_name { get; set; }
    public string college_logo { get; set; }
    public string ex_rate { get; set; }
    public string college_class { get; set; }
    public string college_create_time { get; set; }
    public string master_num { get; set; }
    public string doctor_num { get; set; }
    public string college_nature { get; set; }
    public string college_type { get; set; }
    public string college_area { get; set; }
    public string college_competent { get; set; }
    public string[] college_grade { get; set; }
    public int wsl_ranking { get; set; }
    public string rk_ranking { get; set; }
    public string enrollment_rate_in { get; set; }
    public string enrollment_rate_out { get; set; }
    public string job_report { get; set; }
}

/// <summary>One recruit entry; <see cref="id"/> keys the recruit detail request.</summary>
public class Recruit
{
    public int id { get; set; }
    public int base_college_id { get; set; }
    public string year { get; set; }
    public string college_name { get; set; }
    public string title { get; set; }
}