// Source listing metadata: 150 lines, 5.6 KiB, C#
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Security.AccessControl;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Aliyun.OSS;
using HtmlAgilityPack;
using ICSharpCode.SharpZipLib.Core;
using Microsoft.AspNetCore.Mvc.RazorPages;
using New_College.Common.Helper;
using OpenQA.Selenium.Chrome;
using OSS.Tools.Http;
|
||
|
||
namespace New_Spider
|
||
{
|
||
/// <summary>
/// Scrapes the profession catalogue from www.qingtingai.net and mirrors it into an
/// Aliyun OSS bucket: <see cref="HtmlCreatePageData"/> uploads the category index
/// (<c>list.json</c>) and <see cref="HtmltwlItemsData"/> uploads one JSON document per
/// profession id found in that index.
/// </summary>
public class QingTingSpiderHelper
{
    /// <summary>Host name of the OSS region endpoint (without scheme).</summary>
    private const string OssEndpointHost = "oss-cn-shanghai.aliyuncs.com";

    /// <summary>Name of the OSS bucket that receives every uploaded object.</summary>
    private readonly string bucketName = "static-data-ycymedu";

    /// <summary>Key prefix ("folder") under which every object is stored.</summary>
    private readonly string filePrefix = "qingting-data/";

    /// <summary>OSS client used for all uploads.</summary>
    private readonly OssClient ossClient = CreateOssClient();

    /// <summary>
    /// Builds the OSS client. Credentials are read from the <c>OSS_ACCESS_KEY_ID</c> and
    /// <c>OSS_ACCESS_KEY_SECRET</c> environment variables when both are set.
    /// </summary>
    /// <returns>A client bound to the Shanghai OSS endpoint.</returns>
    private static OssClient CreateOssClient()
    {
        string accessKeyId = Environment.GetEnvironmentVariable("OSS_ACCESS_KEY_ID");
        string accessKeySecret = Environment.GetEnvironmentVariable("OSS_ACCESS_KEY_SECRET");

        if (string.IsNullOrWhiteSpace(accessKeyId) || string.IsNullOrWhiteSpace(accessKeySecret))
        {
            // SECURITY: these literals were committed to source control and must be treated
            // as leaked. They remain only so existing deployments keep working; rotate the
            // key pair, supply the new one through the environment, then delete this fallback.
            accessKeyId = "LTAI5tKs3TXSbt7E4WMDcxwR";
            accessKeySecret = "EvC8MjRaQC1kHubgU4MtecZnofOb0v";
        }

        return new OssClient($"https://{OssEndpointHost}", accessKeyId, accessKeySecret);
    }

    /// <summary>
    /// Loads the profession inquiry page in headless Chrome, extracts every category with
    /// its professions and uploads the result to OSS as <c>{filePrefix}list.json</c>.
    /// </summary>
    /// <remarks>
    /// The method keeps its <c>async Task</c> signature for callers, but all work currently
    /// runs synchronously; failures surface through the returned task.
    /// Nothing is uploaded when no category could be extracted, so a failed scrape does not
    /// overwrite a previously uploaded index with an empty one.
    /// </remarks>
    /// <returns>A task that is complete once the upload (or the skip) has happened.</returns>
    public async Task HtmlCreatePageData()
    {
        var options = new ChromeOptions();
        options.AddArgument("--headless"); // run without a visible browser window
        options.AddArgument("--disable-gpu");
        options.AddArgument("--no-sandbox");

        string pageSource;
        using (var driver = new ChromeDriver(options))
        {
            // Navigate to the target page and capture its HTML; the browser is only
            // needed for this step, so it is released before parsing and uploading.
            driver.Navigate().GoToUrl("https://www.qingtingai.net/inquiryProfession");
            pageSource = driver.PageSource;
        }

        List<BaseZProfessionDto> list = ParseCategories(pageSource);
        if (list.Count == 0)
        {
            Console.WriteLine("No categories scraped; skipping upload so the existing list.json is not overwritten.");
            return;
        }

        // Build the object key and upload synchronously.
        string objectName = $"{filePrefix}list.json";
        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(list.ToJson()));
        ossClient.PutObject(bucketName, objectName, stream);

        Console.WriteLine($"Uploaded page to OSS as {objectName}");
    }

    /// <summary>
    /// Downloads the previously uploaded <c>list.json</c>, then fetches the career detail
    /// JSON for every profession id in it and uploads each as <c>{filePrefix}{id}.json</c>.
    /// </summary>
    /// <remarks>
    /// The method is synchronous by contract, so HTTP calls are blocked on with
    /// <c>.Result</c>; transport failures therefore reach the caller wrapped in an
    /// <see cref="AggregateException"/>. Items whose request returns a non-success status
    /// are logged and skipped.
    /// </remarks>
    public void HtmltwlItemsData()
    {
        using var httpClient = new HttpClient();

        string listUrl = $"https://{bucketName}.{OssEndpointHost}/{filePrefix}list.json";
        using var listResponse = httpClient.GetAsync(listUrl).Result;
        if (!listResponse.IsSuccessStatusCode)
        {
            Console.WriteLine($"Failed to download {listUrl}: HTTP {(int)listResponse.StatusCode}");
            return;
        }

        string listJson = listResponse.Content.ReadAsStringAsync().Result;
        var list = System.Text.Json.JsonSerializer.Deserialize<List<BaseZProfessionDto>>(listJson);
        if (list == null)
        {
            // The document was the JSON literal `null`; there is nothing to process.
            return;
        }

        foreach (var category in list)
        {
            if (category?.itemDtos == null)
            {
                continue;
            }

            foreach (var item in category.itemDtos)
            {
                if (item != null)
                {
                    UploadCareerInfo(httpClient, item.id);
                }
            }
        }
    }

    /// <summary>
    /// Parses the rendered inquiry page into one DTO per top-level category, logging each
    /// category and profession to the console as it is found.
    /// </summary>
    /// <param name="html">Full HTML of the inquiry page.</param>
    /// <returns>The extracted categories; empty when no category node matched.</returns>
    private static List<BaseZProfessionDto> ParseCategories(string html)
    {
        var categories = new List<BaseZProfessionDto>();

        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        var topCategoryNodes = doc.DocumentNode.SelectNodes("//div[contains(@class, 'Top-Category cateMargin')]");
        if (topCategoryNodes == null)
        {
            Console.WriteLine("未找到匹配的分类!");
            return categories;
        }

        foreach (var topCategory in topCategoryNodes)
        {
            // Category title; the XPath matches the class value loosely.
            var titleNode = topCategory.SelectSingleNode(".//span[contains(@class, 'category-defult-title')]");
            string title = titleNode?.InnerText.Trim() ?? "无标题";
            Console.WriteLine($"分类: {title}");

            var itemlist = new List<ZProfessionItemDto>();
            var jobNodes = topCategory.SelectNodes(".//ul/li");
            if (jobNodes != null)
            {
                foreach (var job in jobNodes)
                {
                    var jobName = job.InnerText.Trim();

                    // An <li> without an anchor yields null from the ?. chain, so fall
                    // back to the same placeholder used for a missing href attribute.
                    var jobLink = job.SelectSingleNode(".//a")?.GetAttributeValue("href", "无链接") ?? "无链接";
                    Console.WriteLine($" 职业名称: {jobName}, 链接: {jobLink}");

                    if (TryExtractId(jobLink, out long idValue))
                    {
                        itemlist.Add(new ZProfessionItemDto() { id = idValue, name = jobName });
                    }
                }
            }

            categories.Add(new BaseZProfessionDto() { RootName = title, itemDtos = itemlist });
        }

        return categories;
    }

    /// <summary>
    /// Extracts the numeric value that follows the first <c>id=</c> in a link, stopping at
    /// the next <c>&amp;</c> or <c>#</c> so trailing parameters or fragments do not break parsing.
    /// </summary>
    /// <param name="link">The href value to inspect; may be null or a placeholder.</param>
    /// <param name="id">The parsed id, or 0 when the method returns false.</param>
    /// <returns>True when an integer id was found.</returns>
    private static bool TryExtractId(string link, out long id)
    {
        id = 0;
        if (string.IsNullOrEmpty(link))
        {
            return false;
        }

        const string marker = "id=";
        int start = link.IndexOf(marker, StringComparison.Ordinal);
        if (start < 0)
        {
            return false;
        }

        start += marker.Length;
        int end = link.IndexOfAny(new[] { '&', '#' }, start);
        string raw = end < 0 ? link.Substring(start) : link.Substring(start, end - start);

        return long.TryParse(raw, NumberStyles.Integer, CultureInfo.InvariantCulture, out id);
    }

    /// <summary>
    /// Fetches the career detail JSON for one profession and uploads it to OSS as
    /// <c>{filePrefix}{id}.json</c>, then pauses briefly before the next request.
    /// </summary>
    /// <param name="httpClient">Shared client used for the request.</param>
    /// <param name="id">Profession id to fetch.</param>
    private void UploadCareerInfo(HttpClient httpClient, long id)
    {
        var gourl = $"https://www.qingtingai.net/api/career/get_career_info?id={id}&agt_host=www.qingtingai.net&current_host=www.qingtingai.net";

        using var response = httpClient.GetAsync(gourl).Result;
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine($"Failed to fetch career info for id {id}: HTTP {(int)response.StatusCode}");
            return;
        }

        var jsonData = response.Content.ReadAsStringAsync().Result;
        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData));

        // Build the object key and upload.
        string objectName = $"{filePrefix}{id}.json";
        ossClient.PutObject(bucketName, objectName, stream);
        Console.WriteLine($"Uploaded to OSS as {objectName}");

        // Throttle successive requests to the remote API.
        Thread.Sleep(300);
    }
}
|
||
|
||
/// <summary>
/// One top-level profession category together with the professions listed under it.
/// Property names are part of the serialized JSON and must not be renamed.
/// </summary>
public class BaseZProfessionDto
{
    /// <summary>Display title of the category.</summary>
    public string RootName { get; set; }

    /// <summary>
    /// Professions belonging to this category. Starts as an empty list so code that
    /// iterates a freshly constructed instance does not hit a null reference.
    /// </summary>
    public List<ZProfessionItemDto> itemDtos { get; set; } = new List<ZProfessionItemDto>();
}
|
||
|
||
/// <summary>
/// A single profession entry: its numeric id and its display name.
/// Property names are part of the serialized JSON and must not be renamed.
/// </summary>
public class ZProfessionItemDto
{
    /// <summary>Numeric identifier of the profession.</summary>
    public long id { get; set; }

    /// <summary>Display name of the profession.</summary>
    public string name { get; set; }
}
|
||
|
||
|
||
}
|