develop
old易 2024-12-26 17:03:47 +08:00
parent 99116a3ca8
commit 19b0f1727c
3 changed files with 154 additions and 3 deletions

View File

@ -9,8 +9,8 @@
<ItemGroup> <ItemGroup>
<PackageReference Include="HtmlAgilityPack" Version="1.11.53" /> <PackageReference Include="HtmlAgilityPack" Version="1.11.53" />
<PackageReference Include="Selenium.WebDriver" Version="4.18.1" /> <PackageReference Include="Selenium.WebDriver" Version="4.27.0" />
<PackageReference Include="Selenium.WebDriver.ChromeDriver" Version="122.0.6261.9400" /> <PackageReference Include="Selenium.WebDriver.ChromeDriver" Version="131.0.6778.20400" />
</ItemGroup> </ItemGroup>

View File

@ -28,7 +28,9 @@ using System.Text.RegularExpressions;
// See https://aka.ms/new-console-template for more information // See https://aka.ms/new-console-template for more information
XueHtmlAgHelper xueHtmlAg = new XueHtmlAgHelper(); //XueHtmlAgHelper xueHtmlAg = new XueHtmlAgHelper();
QingTingSpiderHelper tingSpiderHelper=new QingTingSpiderHelper();
tingSpiderHelper.HtmltwlItemsData();
//xueHtmlAg.GetDataRecruitDetail(); //xueHtmlAg.GetDataRecruitDetail();
Console.WriteLine("success!"); Console.WriteLine("success!");
Console.Read(); Console.Read();

View File

@ -0,0 +1,149 @@
using Aliyun.OSS;
using HtmlAgilityPack;
using ICSharpCode.SharpZipLib.Core;
using Microsoft.AspNetCore.Mvc.RazorPages;
using New_College.Common.Helper;
using OpenQA.Selenium.Chrome;
using OSS.Tools.Http;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Security.AccessControl;
using System.Text;
using System.Threading.Tasks;
namespace New_Spider
{
/// <summary>
/// Scrapes profession data from www.qingtingai.net and stores it as JSON objects
/// in an Aliyun OSS bucket.
/// </summary>
/// <remarks>
/// <see cref="HtmlCreatePageData"/> builds and uploads the category index (<c>list.json</c>);
/// <see cref="HtmltwlItemsData"/> reads that index back and uploads one detail file per profession id.
/// </remarks>
public class QingTingSpiderHelper
{
    // OSS bucket that receives every object written by this helper.
    private readonly string bucketName = "static-data-ycymedu";

    // Key prefix under which all objects are stored inside the bucket.
    private readonly string filePrefix = "qingting-data/";

    // OSS client shared by all uploads.
    // NOTE(review): SECURITY - the access key id/secret are committed in source control.
    // They should be rotated and loaded from configuration or the environment instead;
    // left in place here only so existing behaviour is not broken by this change.
    private readonly OssClient ossClient = new OssClient("https://oss-cn-shanghai.aliyuncs.com", "LTAI5tKs3TXSbt7E4WMDcxwR", "EvC8MjRaQC1kHubgU4MtecZnofOb0v");

    /// <summary>
    /// Loads the profession inquiry page in headless Chrome, extracts every category with its
    /// professions, and uploads the result to OSS as <c>{filePrefix}list.json</c>.
    /// </summary>
    /// <remarks>
    /// The method is declared <c>async</c> but contains no <c>await</c>: all work (browser,
    /// parsing, upload) runs synchronously before the returned task completes.
    /// </remarks>
    /// <returns>A task that is already completed when the method returns.</returns>
    public async Task HtmlCreatePageData()
    {
        var options = new ChromeOptions();
        options.AddArgument("--headless"); // run without a visible browser window
        options.AddArgument("--disable-gpu");
        options.AddArgument("--no-sandbox");

        // Only the page source is needed from the browser, so dispose the driver
        // before parsing and uploading.
        string pageSource;
        using (var driver = new ChromeDriver(options))
        {
            driver.Navigate().GoToUrl("https://www.qingtingai.net/inquiryProfession");
            pageSource = driver.PageSource;
        }

        var list = ParseCategories(pageSource);

        // Build the object key and upload synchronously.
        string objectName = $"{filePrefix}list.json";
        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(list.ToJson()));
        ossClient.PutObject(bucketName, objectName, stream);
        Console.WriteLine($"Uploaded page to OSS as {objectName}");
    }

    /// <summary>
    /// Downloads the previously uploaded <c>list.json</c>, then fetches the career-info JSON for
    /// every profession id in it and uploads each response to OSS as <c>{filePrefix}{id}.json</c>.
    /// </summary>
    /// <remarks>
    /// Requests are issued one at a time, with a 300 ms pause after each successful upload.
    /// Responses with a non-success status code are logged and skipped.
    /// </remarks>
    public void HtmltwlItemsData()
    {
        using var httpClient = new HttpClient();
        using var listResponse = httpClient.GetAsync("https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/qingting-data/list.json").Result;
        if (!listResponse.IsSuccessStatusCode)
        {
            Console.WriteLine($"Failed to download list.json: HTTP {(int)listResponse.StatusCode}");
            return;
        }

        var listJson = listResponse.Content.ReadAsStringAsync().Result;
        var list = System.Text.Json.JsonSerializer.Deserialize<List<BaseZProfessionDto>>(listJson);
        if (list == null)
        {
            // The document was the JSON literal "null"; nothing to process.
            return;
        }

        // Plain loops instead of List.ForEach(async ...): an async lambda passed to ForEach is
        // "async void", so exceptions thrown inside it cannot be observed by the caller.
        foreach (var item in list)
        {
            if (item?.itemDtos == null)
            {
                continue;
            }

            foreach (var a in item.itemDtos)
            {
                if (a == null)
                {
                    continue;
                }

                var gourl = $"https://www.qingtingai.net/api/career/get_career_info?id={a.id}&agt_host=www.qingtingai.net&current_host=www.qingtingai.net";
                using var response = httpClient.GetAsync(gourl).Result;
                if (!response.IsSuccessStatusCode)
                {
                    Console.WriteLine($"Skipped id {a.id}: HTTP {(int)response.StatusCode}");
                    continue;
                }

                var jsonData = response.Content.ReadAsStringAsync().Result;
                using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData));

                // Build the object key and upload.
                string objectName = $"{filePrefix}{a.id}.json";
                ossClient.PutObject(bucketName, objectName, stream);
                Console.WriteLine($"Uploaded to OSS as {objectName}");

                // Pause between requests to the remote API.
                Thread.Sleep(300);
            }
        }
    }

    // Parses the rendered page HTML into one entry per top-level category, each with its professions.
    private static List<BaseZProfessionDto> ParseCategories(string pageSource)
    {
        var list = new List<BaseZProfessionDto>();

        var doc = new HtmlDocument();
        doc.LoadHtml(pageSource);

        var topCategoryNodes = doc.DocumentNode.SelectNodes("//div[contains(@class, 'Top-Category cateMargin')]");
        if (topCategoryNodes == null)
        {
            Console.WriteLine("未找到匹配的分类!");
            return list;
        }

        foreach (var topCategory in topCategoryNodes)
        {
            // Category title; the XPath matches the class loosely via contains().
            var titleNode = topCategory.SelectSingleNode(".//span[contains(@class, 'category-defult-title')]");
            string title = titleNode?.InnerText.Trim() ?? "无标题";
            Console.WriteLine($"分类: {title}");

            list.Add(new BaseZProfessionDto() { RootName = title, itemDtos = ParseProfessions(topCategory) });
        }

        return list;
    }

    // Collects the professions (<li> entries) of one category node; entries without a usable id are skipped.
    private static List<ZProfessionItemDto> ParseProfessions(HtmlNode topCategory)
    {
        var itemlist = new List<ZProfessionItemDto>();

        var jobNodes = topCategory.SelectNodes(".//ul/li");
        if (jobNodes == null)
        {
            return itemlist;
        }

        foreach (var job in jobNodes)
        {
            var jobName = job.InnerText.Trim();
            // jobLink is null when the <li> contains no <a> element.
            var jobLink = job.SelectSingleNode(".//a")?.GetAttributeValue("href", "无链接");
            Console.WriteLine($" 职业名称: {jobName}, 链接: {jobLink}");

            if (TryParseProfessionId(jobLink, out long idValue))
            {
                itemlist.Add(new ZProfessionItemDto() { id = idValue, name = jobName });
            }
        }

        return itemlist;
    }

    // Extracts the numeric value that follows the first "id=" in a link.
    // Returns false for a null/empty link, a link without "id=", or a non-numeric value.
    private static bool TryParseProfessionId(string link, out long id)
    {
        id = 0;
        if (string.IsNullOrEmpty(link))
        {
            return false;
        }

        const string marker = "id=";
        int start = link.IndexOf(marker, StringComparison.Ordinal);
        if (start < 0)
        {
            return false;
        }

        start += marker.Length;

        // Stop at the next query parameter or fragment so "id=12&x=y" still yields 12.
        int end = link.IndexOfAny(new[] { '&', '#' }, start);
        string rawId = end < 0 ? link.Substring(start) : link.Substring(start, end - start);

        return long.TryParse(
            rawId,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out id);
    }
}
/// <summary>
/// One top-level profession category together with the professions listed under it.
/// Serialized as an element of the category index (<c>list.json</c>).
/// </summary>
public class BaseZProfessionDto
{
    /// <summary>Category title.</summary>
    public string RootName { get; set; }

    /// <summary>
    /// Professions that belong to this category. Defaults to an empty list so that an
    /// instance created without this property (for example, JSON that omits it) can be
    /// iterated without a null check.
    /// </summary>
    public List<ZProfessionItemDto> itemDtos { get; set; } = new List<ZProfessionItemDto>();
}
/// <summary>
/// A single profession entry: its numeric identifier and its display name.
/// </summary>
public class ZProfessionItemDto
{
    /// <summary>Numeric profession identifier.</summary>
    public long id { get; set; }

    /// <summary>Profession name.</summary>
    public string name { get; set; }
}
}