using Aliyun.OSS;
using HtmlAgilityPack;
using ICSharpCode.SharpZipLib.Core;
using Microsoft.AspNetCore.Mvc.RazorPages;
using New_College.Common.Helper;
using OpenQA.Selenium.Chrome;
using OSS.Tools.Http;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Security.AccessControl;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace New_Spider
{
    /// <summary>
    /// Scrapes the profession catalogue from qingtingai.net and mirrors it into an
    /// Aliyun OSS bucket: one <c>list.json</c> index plus one JSON document per profession.
    /// </summary>
    public class QingTingSpiderHelper
    {
        private const string ProfessionPageUrl = "https://www.qingtingai.net/inquiryProfession";
        private const string ProfessionListUrl = "https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/qingting-data/list.json";

        // Pause after each successful detail upload, in milliseconds.
        private const int RequestDelayMilliseconds = 300;

        private readonly string bucketName = "static-data-ycymedu";
        private readonly string filePrefix = "qingting-data/";

        // Initialise the OSS client.
        // SECURITY(review): the access key id/secret are committed in source. They are left
        // untouched here so existing deployments keep working, but they should be rotated and
        // loaded from configuration or a secret store instead.
        private readonly OssClient ossClient = new OssClient("https://oss-cn-shanghai.aliyuncs.com", "LTAI5tKs3TXSbt7E4WMDcxwR", "EvC8MjRaQC1kHubgU4MtecZnofOb0v");

        /// <summary>
        /// Loads the profession page in headless Chrome, extracts every category together with
        /// its professions, and uploads the result to OSS as <c>{filePrefix}list.json</c>.
        /// </summary>
        /// <remarks>
        /// The method is declared <c>async</c> but contains no awaits, so all work completes
        /// synchronously before the returned task is handed back to the caller.
        /// </remarks>
        /// <returns>A task representing the scrape-and-upload operation.</returns>
        public async Task HtmlCreatePageData()
        {
            var options = new ChromeOptions();
            options.AddArgument("--headless"); // run without a visible browser window
            options.AddArgument("--disable-gpu");
            options.AddArgument("--no-sandbox");

            // Keep the browser alive only as long as needed to capture the page source.
            string pageSource;
            using (var driver = new ChromeDriver(options))
            {
                driver.Navigate().GoToUrl(ProfessionPageUrl);
                pageSource = driver.PageSource;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(pageSource);

            var list = new List<BaseZProfessionDto>();
            var topCategoryNodes = doc.DocumentNode.SelectNodes("//div[contains(@class, 'Top-Category cateMargin')]");
            if (topCategoryNodes != null)
            {
                foreach (var topCategory in topCategoryNodes)
                {
                    list.Add(ParseCategory(topCategory));
                }
            }
            else
            {
                Console.WriteLine("未找到匹配的分类!");
            }

            // Build the object key and upload (synchronous upload).
            string objectName = $"{filePrefix}list.json";
            using var stream = new MemoryStream(Encoding.UTF8.GetBytes(list.ToJson()));
            ossClient.PutObject(bucketName, objectName, stream);
            Console.WriteLine($"Uploaded page to OSS as {objectName}");
        }

        /// <summary>
        /// Downloads the previously uploaded <c>list.json</c>, then fetches the detail JSON of
        /// every listed profession and uploads each one to OSS as <c>{filePrefix}{id}.json</c>.
        /// </summary>
        /// <remarks>
        /// The method is synchronous by contract, so HTTP calls are blocked on with
        /// <c>.Result</c> and requests are throttled with <see cref="Thread.Sleep(int)"/>.
        /// </remarks>
        public void HtmltwlItemsData()
        {
            using var httpClient = new HttpClient();
            using var listResponse = httpClient.GetAsync(ProfessionListUrl).Result;
            if (!listResponse.IsSuccessStatusCode)
            {
                Console.WriteLine($"Failed to download profession list: {(int)listResponse.StatusCode}");
                return;
            }

            var listJson = listResponse.Content.ReadAsStringAsync().Result;
            var list = System.Text.Json.JsonSerializer.Deserialize<List<BaseZProfessionDto>>(listJson);
            if (list == null)
            {
                // A literal JSON "null" deserializes to null; there is nothing to process.
                return;
            }

            // Plain loops instead of List.ForEach(async ...): the async lambda compiled to
            // async void, so any exception inside it could not be observed by the caller.
            foreach (var item in list)
            {
                if (item?.itemDtos == null)
                {
                    continue;
                }

                foreach (var a in item.itemDtos)
                {
                    var gourl = $"https://www.qingtingai.net/api/career/get_career_info?id={a.id}&agt_host=www.qingtingai.net&current_host=www.qingtingai.net";
                    using var response = httpClient.GetAsync(gourl).Result;
                    if (!response.IsSuccessStatusCode)
                    {
                        Console.WriteLine($"Failed to fetch profession {a.id}: {(int)response.StatusCode}");
                        continue;
                    }

                    var jsonData = response.Content.ReadAsStringAsync().Result;
                    using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData));

                    // Build the object key and upload.
                    string objectName = $"{filePrefix}{a.id}.json";
                    ossClient.PutObject(bucketName, objectName, stream);
                    Console.WriteLine($"Uploaded to OSS as {objectName}");

                    Thread.Sleep(RequestDelayMilliseconds);
                }
            }
        }

        /// <summary>
        /// Converts one top-level category node into a DTO holding the category title and the
        /// professions whose link carries a numeric <c>id=</c> value.
        /// </summary>
        /// <param name="topCategory">The category container node.</param>
        /// <returns>The category with its parsed professions (possibly none).</returns>
        private static BaseZProfessionDto ParseCategory(HtmlNode topCategory)
        {
            // Category title; the class match is deliberately loose.
            var titleNode = topCategory.SelectSingleNode(".//span[contains(@class, 'category-defult-title')]");
            string title = titleNode?.InnerText.Trim() ?? "无标题";
            Console.WriteLine($"分类: {title}");

            var itemlist = new List<ZProfessionItemDto>();
            var jobNodes = topCategory.SelectNodes(".//ul/li");
            if (jobNodes != null)
            {
                foreach (var job in jobNodes)
                {
                    var jobName = job.InnerText.Trim();
                    // Null when the <li> has no <a>; "无链接" when the <a> has no href.
                    var jobLink = job.SelectSingleNode(".//a")?.GetAttributeValue("href", "无链接");
                    Console.WriteLine($" 职业名称: {jobName}, 链接: {jobLink}");

                    if (TryParseProfessionId(jobLink, out long idValue))
                    {
                        itemlist.Add(new ZProfessionItemDto() { id = idValue, name = jobName });
                    }
                }
            }

            return new BaseZProfessionDto() { RootName = title, itemDtos = itemlist };
        }

        /// <summary>
        /// Extracts the numeric value that follows a stand-alone <c>id=</c> key in a link.
        /// </summary>
        /// <param name="href">The link text; may be null or empty.</param>
        /// <param name="id">The parsed id, or 0 when none was found.</param>
        /// <returns><c>true</c> when a numeric id was found; otherwise <c>false</c>.</returns>
        private static bool TryParseProfessionId(string href, out long id)
        {
            id = 0;
            if (string.IsNullOrEmpty(href))
            {
                return false;
            }

            int searchFrom = 0;
            while (true)
            {
                int at = href.IndexOf("id=", searchFrom, StringComparison.Ordinal);
                if (at < 0)
                {
                    return false;
                }

                // Skip keys that merely end in "id" (e.g. "pid=", "user_id=").
                bool standalone = at == 0 || !(char.IsLetterOrDigit(href[at - 1]) || href[at - 1] == '_');
                if (standalone)
                {
                    int start = at + 3;
                    int end = start;
                    // Take only the digit run so trailing "&other=..." does not break parsing.
                    while (end < href.Length && href[end] >= '0' && href[end] <= '9')
                    {
                        end++;
                    }

                    return end > start && long.TryParse(href.Substring(start, end - start), out id);
                }

                searchFrom = at + 3;
            }
        }
    }

    /// <summary>
    /// A top-level profession category and the professions listed under it.
    /// Property names are part of the serialized JSON and must not be renamed.
    /// </summary>
    public class BaseZProfessionDto
    {
        public string RootName { get; set; }

        public List<ZProfessionItemDto> itemDtos { get; set; }
    }

    /// <summary>
    /// A single profession entry: its numeric id and display name.
    /// Property names are part of the serialized JSON and must not be renamed.
    /// </summary>
    public class ZProfessionItemDto
    {
        public long id { get; set; }

        public string name { get; set; }
    }
}