NewGaoKaoApi/New_Spider/xuexingaokao/XueHtmlAgHelper.cs

538 lines
22 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

using Aliyun.OSS;
using HtmlAgilityPack;
using NPOI.Util.ArrayExtensions;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
namespace New_Spider
{
/// <summary>
/// Crawls college data from the xuexingaokao API and stores the raw JSON responses as
/// objects in an Aliyun OSS bucket.
/// </summary>
/// <remarks>
/// <see cref="HtmlCreatePageData"/> stores the paginated college list. Every other crawl reads
/// those stored pages back over HTTP and requests one API resource per college (or, for
/// <see cref="GetDataRecruitDetail"/>, per recruitment notice) found in them.
/// All crawls are best-effort: a failed request, parse or upload is logged to the console and
/// the crawl continues with the next item.
/// </remarks>
public class XueHtmlAgHelper
{
    /// <summary>Number of college list pages to crawl (the list is requested 10 colleges per page).</summary>
    private const int TotalPages = 287;

    /// <summary>Common prefix of every college endpoint on the source API.</summary>
    private const string ApiBaseUrl = "http://api.xuexingaokao.com/api/college/base_college/";

    /// <summary>HTTP endpoint through which objects stored in the bucket are read back.</summary>
    private const string OssPublicBaseUrl = "https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/";

    private readonly string bucketName = "static-data-ycymedu";

    /// <summary>Object-name prefix of the stored college list pages (<c>college-data/page-N.json</c>).</summary>
    private readonly string filePrefix = "college-data/page-";

    // SECURITY(review): the access key id and secret are embedded in source. They should be
    // rotated and loaded from configuration or the environment instead. The values are kept
    // here unchanged so that this edit does not alter runtime behaviour.
    private readonly OssClient ossClient = new OssClient("https://oss-cn-shanghai.aliyuncs.com", "LTAI5tKs3TXSbt7E4WMDcxwR", "EvC8MjRaQC1kHubgU4MtecZnofOb0v");

    /// <summary>
    /// Downloads every page of the college list and uploads each one to OSS as
    /// <c>college-data/page-N.json</c>.
    /// </summary>
    /// <returns>A task that completes when all pages have been attempted.</returns>
    public async Task HtmlCreatePageData()
    {
        using var httpClient = new HttpClient();
        for (int page = 1; page <= TotalPages; page++)
        {
            try
            {
                var url = $"{ApiBaseUrl}index?area=&type=&nature=&grade=&is_bz=&cname=&page={page}&perpage=10&timestamp={UnixNow()}";
                Console.WriteLine($"Fetching page {page}...");
                await FetchAndUploadAsync(httpClient, url, $"{filePrefix}{page}.json", page).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error fetching or uploading page {page}: {ex.Message}");
            }
        }
    }

    /// <summary>
    /// Stores the <c>collegeDetail</c> response of every listed college as
    /// <c>college-detail/ID.json</c>.
    /// </summary>
    /// <returns>A task that completes when all colleges have been attempted.</returns>
    public async Task GetDataDetail()
    {
        await CrawlPerCollegeAsync("collegeDetail", "college-detail/", "Fetching", "detailgourl", 0).ConfigureAwait(false);
    }

    /// <summary>
    /// Stores the <c>recruitList</c> response of every listed college as
    /// <c>college-recruitlist/ID.json</c>, pausing 200 ms before each request.
    /// </summary>
    public void GetDataRecruitList()
    {
        // The public signature is synchronous, so block once here instead of on every request.
        CrawlPerCollegeAsync("recruitList", "college-recruitlist/", "recruitListUrl", "recruitListUrl", 200).GetAwaiter().GetResult();
    }

    /// <summary>
    /// Stores the <c>subjectIntroduce</c> response of every listed college as
    /// <c>college-subjectIntroduce/ID.json</c>, pausing 200 ms before each request.
    /// </summary>
    public void GetDataSubjectIntroduce()
    {
        CrawlPerCollegeAsync("subjectIntroduce", "college-subjectIntroduce/", "subjectIntroduceurl", "subjectIntroduceurl", 200).GetAwaiter().GetResult();
    }

    /// <summary>
    /// Stores the <c>faculty</c> response of every listed college as
    /// <c>college-faculty/ID.json</c>, pausing 300 ms before each request.
    /// </summary>
    public void GetDataFaculty()
    {
        CrawlPerCollegeAsync("faculty", "college-faculty/", "Fetching", "facultyurl", 300).GetAwaiter().GetResult();
    }

    /// <summary>
    /// For every listed college, reads its stored <c>college-recruitlist/ID.json</c> and stores the
    /// <c>recruitDetail</c> response of each notice in it as <c>college-recruitdetail/ID.json</c>,
    /// pausing 150 ms before each request.
    /// </summary>
    public void GetDataRecruitDetail()
    {
        CrawlRecruitDetailsAsync().GetAwaiter().GetResult();
    }

    /// <summary>Current time as Unix seconds, appended to API requests as the <c>timestamp</c> parameter.</summary>
    private static long UnixNow() => DateTimeOffset.Now.ToUnixTimeSeconds();

    /// <summary>
    /// Downloads <paramref name="url"/> and, on a success status, uploads the body to OSS under
    /// <paramref name="objectName"/>. A non-success status is logged and nothing is uploaded.
    /// </summary>
    /// <param name="httpClient">Client shared by the running crawl.</param>
    /// <param name="url">Absolute URL to download.</param>
    /// <param name="objectName">Target object name inside the bucket.</param>
    /// <param name="page">List page being processed; only used in log messages.</param>
    private async Task FetchAndUploadAsync(HttpClient httpClient, string url, string objectName, int page)
    {
        using var response = await httpClient.GetAsync(url).ConfigureAwait(false);
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine($"Failed to fetch page {page}, Status Code: {response.StatusCode}");
            return;
        }

        var body = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(body));
        ossClient.PutObject(bucketName, objectName, stream);
        Console.WriteLine($"Uploaded page {page} to OSS as {objectName}");
    }

    /// <summary>
    /// Reads a stored college list page back from the bucket.
    /// </summary>
    /// <returns>
    /// The parsed page, or <c>null</c> when the download fails, the body cannot be parsed, or
    /// the response code is not 1.
    /// </returns>
    private async Task<universitylistobject> LoadCollegeListPageAsync(HttpClient httpClient, int page)
    {
        try
        {
            using var response = await httpClient.GetAsync($"{OssPublicBaseUrl}{filePrefix}{page}.json").ConfigureAwait(false);
            if (!response.IsSuccessStatusCode)
            {
                Console.WriteLine($"Failed to fetch page {page}, Status Code: {response.StatusCode}");
                return null;
            }

            var body = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
            var listing = JsonSerializer.Deserialize<universitylistobject>(body);
            return listing?.code == 1 ? listing : null;
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error loading page {page}: {ex.Message}");
            return null;
        }
    }

    /// <summary>
    /// Shared crawl loop: for every college on every stored list page, requests
    /// <c>{endpoint}?college_id=ID</c> from the API and uploads the response as
    /// <c>{objectPrefix}ID.json</c>.
    /// </summary>
    /// <param name="endpoint">Endpoint name below <see cref="ApiBaseUrl"/>.</param>
    /// <param name="objectPrefix">Object-name prefix for the uploaded responses.</param>
    /// <param name="pageLabel">Word printed in front of the per-page progress line.</param>
    /// <param name="itemLabel">Word printed in front of the per-college progress line.</param>
    /// <param name="delayMilliseconds">Pause before each API request; 0 disables throttling.</param>
    private async Task CrawlPerCollegeAsync(string endpoint, string objectPrefix, string pageLabel, string itemLabel, int delayMilliseconds)
    {
        using var httpClient = new HttpClient();
        for (int page = 1; page <= TotalPages; page++)
        {
            Console.WriteLine($"{pageLabel} page {page}...");
            var listing = await LoadCollegeListPageAsync(httpClient, page).ConfigureAwait(false);
            var colleges = listing?.data?.data;
            if (colleges == null)
            {
                continue;
            }

            foreach (var college in colleges)
            {
                if (delayMilliseconds > 0)
                {
                    await Task.Delay(delayMilliseconds).ConfigureAwait(false);
                }

                string objectName = $"{objectPrefix}{college.id}.json";
                try
                {
                    var url = $"{ApiBaseUrl}{endpoint}?college_id={college.id}&timestamp={UnixNow()}";
                    Console.WriteLine($"{itemLabel} {college.id}...");
                    await FetchAndUploadAsync(httpClient, url, objectName, page).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    // One failing college must not abort the remaining crawl.
                    Console.WriteLine($"Error fetching or uploading {objectName}: {ex.Message}");
                }
            }
        }
    }

    /// <summary>
    /// Reads a stored recruitment list back from the bucket.
    /// </summary>
    /// <returns>
    /// The parsed list, or <c>null</c> when the download fails, the body cannot be parsed, or
    /// the response code is not 1.
    /// </returns>
    private async Task<RecruitRootobject> LoadRecruitListAsync(HttpClient httpClient, string objectUrl)
    {
        try
        {
            using var response = await httpClient.GetAsync(objectUrl).ConfigureAwait(false);
            if (!response.IsSuccessStatusCode)
            {
                return null;
            }

            var body = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
            var items = JsonSerializer.Deserialize<RecruitRootobject>(body);
            return items?.code == 1 ? items : null;
        }
        catch (Exception exx)
        {
            Console.WriteLine(exx.Message);
            Console.WriteLine($"{objectUrl}解析失败");
            return null;
        }
    }

    /// <summary>Asynchronous core of <see cref="GetDataRecruitDetail"/>.</summary>
    private async Task CrawlRecruitDetailsAsync()
    {
        const string recruitListPrefix = "college-recruitlist/";
        const string recruitDetailPrefix = "college-recruitdetail/";

        using var httpClient = new HttpClient();
        for (int page = 1; page <= TotalPages; page++)
        {
            Console.WriteLine($"recruitListUrl page {page}...");
            var listing = await LoadCollegeListPageAsync(httpClient, page).ConfigureAwait(false);
            var colleges = listing?.data?.data;
            if (colleges == null)
            {
                continue;
            }

            foreach (var college in colleges)
            {
                string objectUrl = $"{OssPublicBaseUrl}{recruitListPrefix}{college.id}.json";
                Console.WriteLine($"down page {objectUrl}");
                var recruitList = await LoadRecruitListAsync(httpClient, objectUrl).ConfigureAwait(false);
                var notices = recruitList?.data?.recruit;
                if (notices == null)
                {
                    continue;
                }

                foreach (var notice in notices)
                {
                    await Task.Delay(150).ConfigureAwait(false);
                    try
                    {
                        var url = $"{ApiBaseUrl}recruitDetail?recruit_id={notice.id}&timestamp={UnixNow()}";
                        await FetchAndUploadAsync(httpClient, url, $"{recruitDetailPrefix}{notice.id}.json", page).ConfigureAwait(false);
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex.Message);
                    }
                }
            }
        }
    }
}
}
/// <summary>
/// Top-level JSON envelope of a college list page. XueHtmlAgHelper deserializes the stored
/// list pages into this type with System.Text.Json.
/// </summary>
public class universitylistobject
{
/// <summary>Response code; XueHtmlAgHelper only processes a page when this equals 1.</summary>
public int code { get; set; }
/// <summary>Message accompanying the code (not read by any code in this file).</summary>
public string msg { get; set; }
/// <summary>Payload carrying the paging fields and the college entries.</summary>
public Data data { get; set; }
}
/// <summary>
/// Payload of a college list response: paging fields, filter data and the colleges on the page.
/// </summary>
public class Data
{
/// <summary>Total count reported by the response (presumably of matching colleges; confirm).</summary>
public int total { get; set; }
/// <summary>Has-next flag, typed as a string.</summary>
public string has_next { get; set; }
/// <summary>Page number, typed as a string.</summary>
public string page { get; set; }
/// <summary>Page size, typed as a string.</summary>
public string perpage { get; set; }
/// <summary>
/// Request parameters block.
/// NOTE(review): System.Text.Json binds by property name, so this property only receives a
/// JSON key named "_params". If the payload key is "params" (a C# keyword, which would explain
/// the underscore) it stays null unless a JsonPropertyName attribute is added; confirm against
/// a real response.
/// </summary>
public Params _params { get; set; }
/// <summary>Lists of selectable filter values (area, type, nature, ...).</summary>
public Paramslist paramsList { get; set; }
/// <summary>Colleges on this page; XueHtmlAgHelper iterates these and uses each entry's id.</summary>
public Datum[] data { get; set; }
}
/// <summary>
/// Request parameters block of a list response. The property names mirror the query-string
/// parameters that XueHtmlAgHelper.HtmlCreatePageData sends to the list endpoint
/// (presumably an echo of the request; confirm against a real response).
/// </summary>
public class Params
{
public string area { get; set; }
public string type { get; set; }
public string nature { get; set; }
public string grade { get; set; }
public string is_bz { get; set; }
public string cname { get; set; }
public string page { get; set; }
public string perpage { get; set; }
public string timestamp { get; set; }
}
/// <summary>
/// Groups the value lists delivered alongside a college list page (presumably the options
/// for the list endpoint's filters; not read by any code in this file).
/// </summary>
public class Paramslist
{
/// <summary>Areas, each optionally with child areas.</summary>
public Arealist[] arealist { get; set; }
/// <summary>College types.</summary>
public Typelist[] typelist { get; set; }
/// <summary>College natures.</summary>
public Naturelist[] naturelist { get; set; }
/// <summary>Entries shaped like <see cref="Naturelist"/> ("xingzhi"; meaning to be confirmed).</summary>
public Xingzhilist[] xingzhilist { get; set; }
/// <summary>Entries shaped like <see cref="Naturelist"/> ("class"; meaning to be confirmed).</summary>
public Classlist[] classlist { get; set; }
}
/// <summary>
/// One area entry of <see cref="Paramslist.arealist"/>, with its nested child areas.
/// </summary>
public class Arealist
{
public int id { get; set; }
public string area { get; set; }
public string province { get; set; }
public string city { get; set; }
/// <summary>Creation time as an integer (presumably Unix seconds; confirm).</summary>
public int create_time { get; set; }
public string text { get; set; }
/// <summary>Child areas nested under this entry.</summary>
public Child[] children { get; set; }
}
/// <summary>
/// Child area nested under an <see cref="Arealist"/> entry; same fields as its parent minus
/// the children array.
/// </summary>
public class Child
{
public int id { get; set; }
public string area { get; set; }
public string province { get; set; }
public string city { get; set; }
/// <summary>Creation time as an integer (presumably Unix seconds; confirm).</summary>
public int create_time { get; set; }
public string text { get; set; }
}
/// <summary>
/// One college-type entry of <see cref="Paramslist.typelist"/>.
/// </summary>
public class Typelist
{
public int id { get; set; }
public string type_name { get; set; }
/// <summary>Creation time as an integer (presumably Unix seconds; confirm).</summary>
public int create_time { get; set; }
/// <summary>Ordering value (presumably the display position; confirm).</summary>
public int list_order { get; set; }
}
/// <summary>
/// One entry of <see cref="Paramslist.naturelist"/>.
/// </summary>
public class Naturelist
{
public int id { get; set; }
public string nature_name { get; set; }
/// <summary>Creation time as an integer (presumably Unix seconds; confirm).</summary>
public int create_time { get; set; }
/// <summary>Ordering value (presumably the display position; confirm).</summary>
public int list_order { get; set; }
/// <summary>
/// NOTE(review): "staue" looks like a misspelling of "status". The property name must match
/// the JSON key to bind, so do not rename it without checking the payload.
/// </summary>
public int staue { get; set; }
}
/// <summary>
/// One entry of <see cref="Paramslist.xingzhilist"/>; structurally identical to
/// <see cref="Naturelist"/>.
/// </summary>
public class Xingzhilist
{
public int id { get; set; }
public string nature_name { get; set; }
/// <summary>Creation time as an integer (presumably Unix seconds; confirm).</summary>
public int create_time { get; set; }
/// <summary>Ordering value (presumably the display position; confirm).</summary>
public int list_order { get; set; }
/// <summary>
/// NOTE(review): "staue" looks like a misspelling of "status". The property name must match
/// the JSON key to bind, so do not rename it without checking the payload.
/// </summary>
public int staue { get; set; }
}
/// <summary>
/// One entry of <see cref="Paramslist.classlist"/>; structurally identical to
/// <see cref="Naturelist"/>.
/// </summary>
public class Classlist
{
public int id { get; set; }
public string nature_name { get; set; }
/// <summary>Creation time as an integer (presumably Unix seconds; confirm).</summary>
public int create_time { get; set; }
/// <summary>Ordering value (presumably the display position; confirm).</summary>
public int list_order { get; set; }
/// <summary>
/// NOTE(review): "staue" looks like a misspelling of "status". The property name must match
/// the JSON key to bind, so do not rename it without checking the payload.
/// </summary>
public int staue { get; set; }
}
/// <summary>
/// One college on a list page (an element of <see cref="Data.data"/>).
/// </summary>
public class Datum
{
/// <summary>
/// College id. XueHtmlAgHelper sends it as the college_id query parameter and uses it in the
/// names of the objects it uploads.
/// </summary>
public int id { get; set; }
public string college_name { get; set; }
public string college_name_display { get; set; }
public string rk_ranking_2024 { get; set; }
public int rk_ranking_list { get; set; }
public string college_class { get; set; }
public string college_nature { get; set; }
/// <summary>Nested block with further attributes of the college.</summary>
public Detail detail { get; set; }
public string rk_ranking { get; set; }
}
/// <summary>
/// Attribute block nested inside a list entry (<see cref="Datum.detail"/>).
/// </summary>
public class Detail
{
public int id { get; set; }
public string college_name { get; set; }
public string college_logo { get; set; }
/// <summary>Declared as object, so no fixed JSON value type is assumed for this field.</summary>
public object ex_rate { get; set; }
public string college_class { get; set; }
public string college_create_time { get; set; }
public string master_num { get; set; }
public string doctor_num { get; set; }
public string college_nature { get; set; }
public string college_type { get; set; }
public string college_area { get; set; }
public string college_competent { get; set; }
public string[] college_grade { get; set; }
/// <summary>Declared as object, so no fixed JSON value type is assumed for this field.</summary>
public object wsl_ranking { get; set; }
public string rk_ranking { get; set; }
public string enrollment_rate_in { get; set; }
public string enrollment_rate_out { get; set; }
public string job_report { get; set; }
}
/// <summary>
/// Top-level JSON envelope of a stored recruitment list. XueHtmlAgHelper.GetDataRecruitDetail
/// deserializes the college-recruitlist objects into this type.
/// </summary>
public class RecruitRootobject
{
/// <summary>Response code; GetDataRecruitDetail only processes the list when this equals 1.</summary>
public int code { get; set; }
/// <summary>Message accompanying the code (not read by any code in this file).</summary>
public string msg { get; set; }
/// <summary>Payload carrying the college tag and its recruitment notices.</summary>
public RecruitData data { get; set; }
}
/// <summary>
/// Payload of a recruitment list response.
/// </summary>
public class RecruitData
{
/// <summary>Attribute block of the college the list belongs to.</summary>
public Tag tag { get; set; }
/// <summary>Recruitment notices; GetDataRecruitDetail fetches the detail of each entry by its id.</summary>
public List<Recruit> recruit { get; set; }
}
/// <summary>
/// College attribute block of a recruitment list response. It has the same field names as
/// <see cref="Detail"/>, but here id and ex_rate are strings and wsl_ranking is an int.
/// </summary>
public class Tag
{
public string id { get; set; }
public string college_name { get; set; }
public string college_logo { get; set; }
public string ex_rate { get; set; }
public string college_class { get; set; }
public string college_create_time { get; set; }
public string master_num { get; set; }
public string doctor_num { get; set; }
public string college_nature { get; set; }
public string college_type { get; set; }
public string college_area { get; set; }
public string college_competent { get; set; }
public string[] college_grade { get; set; }
public int wsl_ranking { get; set; }
public string rk_ranking { get; set; }
public string enrollment_rate_in { get; set; }
public string enrollment_rate_out { get; set; }
public string job_report { get; set; }
}
/// <summary>
/// One recruitment notice in a recruitment list (an element of <see cref="RecruitData.recruit"/>).
/// </summary>
public class Recruit
{
/// <summary>
/// Notice id. GetDataRecruitDetail sends it as the recruit_id query parameter and uses it in
/// the name of the uploaded college-recruitdetail object.
/// </summary>
public int id { get; set; }
/// <summary>Id of the college the notice belongs to (presumably matches <see cref="Datum.id"/>; confirm).</summary>
public int base_college_id { get; set; }
public string year { get; set; }
public string college_name { get; set; }
public string title { get; set; }
}