111
parent
99116a3ca8
commit
19b0f1727c
|
|
@ -9,8 +9,8 @@
|
|||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="HtmlAgilityPack" Version="1.11.53" />
|
||||
<PackageReference Include="Selenium.WebDriver" Version="4.18.1" />
|
||||
<PackageReference Include="Selenium.WebDriver.ChromeDriver" Version="122.0.6261.9400" />
|
||||
<PackageReference Include="Selenium.WebDriver" Version="4.27.0" />
|
||||
<PackageReference Include="Selenium.WebDriver.ChromeDriver" Version="131.0.6778.20400" />
|
||||
|
||||
</ItemGroup>
|
||||
|
||||
|
|
|
|||
|
|
@ -28,7 +28,9 @@ using System.Text.RegularExpressions;
|
|||
// See https://aka.ms/new-console-template for more information
|
||||
|
||||
|
||||
XueHtmlAgHelper xueHtmlAg = new XueHtmlAgHelper();
|
||||
//XueHtmlAgHelper xueHtmlAg = new XueHtmlAgHelper();
|
||||
QingTingSpiderHelper tingSpiderHelper=new QingTingSpiderHelper();
|
||||
tingSpiderHelper.HtmltwlItemsData();
|
||||
//xueHtmlAg.GetDataRecruitDetail();
|
||||
Console.WriteLine("success!");
|
||||
Console.Read();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,149 @@
|
|||
using Aliyun.OSS;
|
||||
using HtmlAgilityPack;
|
||||
using ICSharpCode.SharpZipLib.Core;
|
||||
using Microsoft.AspNetCore.Mvc.RazorPages;
|
||||
using New_College.Common.Helper;
|
||||
using OpenQA.Selenium.Chrome;
|
||||
using OSS.Tools.Http;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using System.Net.Http;
|
||||
using System.Security.AccessControl;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace New_Spider
|
||||
{
|
||||
/// <summary>
/// Scrapes profession data from www.qingtingai.net and stores it as JSON objects
/// under the <c>qingting-data/</c> prefix of an Aliyun OSS bucket.
/// </summary>
public class QingTingSpiderHelper
{
    private const string BucketName = "static-data-ycymedu";
    private const string FilePrefix = "qingting-data/";
    private const string OssEndpoint = "https://oss-cn-shanghai.aliyuncs.com";
    private const string ProfessionPageUrl = "https://www.qingtingai.net/inquiryProfession";
    private const string ProfessionListUrl = "https://static-data-ycymedu.oss-cn-shanghai.aliyuncs.com/qingting-data/list.json";

    // Placeholder used (and logged) when a job entry has no usable link.
    private const string MissingLinkPlaceholder = "无链接";

    // Pause between two detail requests so the remote API is not hammered.
    private const int RequestDelayMilliseconds = 300;

    // OSS client used for every upload.
    // SECURITY: the literal fallbacks below are credentials that were committed to
    // source control. They are kept only so existing deployments keep working.
    // Rotate the key, supply OSS_ACCESS_KEY_ID / OSS_ACCESS_KEY_SECRET through the
    // environment, and then delete the fallbacks.
    private readonly OssClient ossClient = new OssClient(
        OssEndpoint,
        ResolveCredential("OSS_ACCESS_KEY_ID", "LTAI5tKs3TXSbt7E4WMDcxwR"),
        ResolveCredential("OSS_ACCESS_KEY_SECRET", "EvC8MjRaQC1kHubgU4MtecZnofOb0v"));

    /// <summary>
    /// Loads the profession overview page in headless Chrome, extracts every
    /// category with its professions, and uploads the result to OSS as
    /// <c>qingting-data/list.json</c>.
    /// </summary>
    /// <remarks>
    /// The method contains no awaits, so all work runs synchronously and the
    /// returned task is already completed when the call returns. When nothing
    /// could be scraped the upload is skipped, so a previously uploaded list is
    /// not overwritten with an empty one.
    /// </remarks>
    /// <returns>A task representing the (already finished) operation.</returns>
    public async Task HtmlCreatePageData()
    {
        var options = new ChromeOptions();
        options.AddArgument("--headless"); // run without a visible browser window
        options.AddArgument("--disable-gpu");
        options.AddArgument("--no-sandbox");

        // Only the rendered HTML is needed, so the browser is closed before parsing.
        string pageSource;
        using (var driver = new ChromeDriver(options))
        {
            driver.Navigate().GoToUrl(ProfessionPageUrl);
            pageSource = driver.PageSource;
        }

        var list = ParseCategories(pageSource);
        if (list.Count == 0)
        {
            Console.WriteLine("未找到匹配的分类!");
            return;
        }

        // Build the object name and upload synchronously.
        string objectName = $"{FilePrefix}list.json";
        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(list.ToJson()));
        ossClient.PutObject(BucketName, objectName, stream);

        Console.WriteLine($"Uploaded page to OSS as {objectName}");
    }

    /// <summary>
    /// Downloads <c>list.json</c> from the bucket's public URL, requests the detail
    /// JSON of every profession id found in it, and uploads each response to OSS
    /// as <c>qingting-data/{id}.json</c>.
    /// </summary>
    /// <remarks>
    /// The signature is synchronous, so HTTP calls are waited on with
    /// <c>.Result</c>; failures therefore surface as <see cref="AggregateException"/>.
    /// Non-success HTTP responses are logged and skipped.
    /// </remarks>
    public void HtmltwlItemsData()
    {
        using var httpClient = new HttpClient();

        using var listResponse = httpClient.GetAsync(ProfessionListUrl).Result;
        if (!listResponse.IsSuccessStatusCode)
        {
            Console.WriteLine($"Failed to download {ProfessionListUrl}: HTTP {(int)listResponse.StatusCode}");
            return;
        }

        var listJson = listResponse.Content.ReadAsStringAsync().Result;
        var list = System.Text.Json.JsonSerializer.Deserialize<List<BaseZProfessionDto>>(listJson);
        if (list == null)
        {
            // A JSON document consisting of the literal null deserializes to null.
            Console.WriteLine($"{ProfessionListUrl} contained no profession list.");
            return;
        }

        // Plain loops instead of List.ForEach with an async lambda: that lambda was
        // async void, so an exception thrown inside it could not be observed here.
        foreach (var category in list)
        {
            if (category?.itemDtos == null)
            {
                continue;
            }

            foreach (var item in category.itemDtos)
            {
                if (item == null)
                {
                    continue;
                }

                var detailUrl = $"https://www.qingtingai.net/api/career/get_career_info?id={item.id}&agt_host=www.qingtingai.net&current_host=www.qingtingai.net";
                using (var response = httpClient.GetAsync(detailUrl).Result)
                {
                    if (response.IsSuccessStatusCode)
                    {
                        var jsonData = response.Content.ReadAsStringAsync().Result;
                        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonData));

                        // Build the object name and upload.
                        string objectName = $"{FilePrefix}{item.id}.json";
                        ossClient.PutObject(BucketName, objectName, stream);
                        Console.WriteLine($"Uploaded to OSS as {objectName}");
                    }
                    else
                    {
                        Console.WriteLine($"Skipped id {item.id}: HTTP {(int)response.StatusCode}");
                    }
                }

                // Throttle after every request, successful or not.
                Thread.Sleep(RequestDelayMilliseconds);
            }
        }
    }

    /// <summary>
    /// Returns the value of an environment variable, or <paramref name="fallback"/>
    /// when the variable is missing or blank.
    /// </summary>
    private static string ResolveCredential(string variableName, string fallback)
    {
        var value = Environment.GetEnvironmentVariable(variableName);
        return string.IsNullOrWhiteSpace(value) ? fallback : value;
    }

    /// <summary>
    /// Extracts all categories and their professions from the overview page HTML.
    /// </summary>
    /// <param name="pageSource">HTML of the profession overview page.</param>
    /// <returns>The parsed categories; empty when no category element matched.</returns>
    private static List<BaseZProfessionDto> ParseCategories(string pageSource)
    {
        var categories = new List<BaseZProfessionDto>();

        var doc = new HtmlDocument();
        doc.LoadHtml(pageSource);

        var topCategoryNodes = doc.DocumentNode.SelectNodes("//div[contains(@class, 'Top-Category cateMargin')]");
        if (topCategoryNodes == null)
        {
            return categories;
        }

        foreach (var topCategory in topCategoryNodes)
        {
            // Category title; the XPath matches the class value loosely.
            var titleNode = topCategory.SelectSingleNode(".//span[contains(@class, 'category-defult-title')]");
            string title = titleNode?.InnerText.Trim() ?? "无标题";
            Console.WriteLine($"分类: {title}");

            // Professions listed under this category.
            var items = new List<ZProfessionItemDto>();
            var jobNodes = topCategory.SelectNodes(".//ul/li");
            if (jobNodes != null)
            {
                foreach (var job in jobNodes)
                {
                    var jobName = job.InnerText.Trim();

                    // An <li> without an <a> yields null from SelectSingleNode; fall
                    // back to the placeholder instead of dereferencing null below.
                    var jobLink = job.SelectSingleNode(".//a")?.GetAttributeValue("href", MissingLinkPlaceholder)
                                  ?? MissingLinkPlaceholder;
                    Console.WriteLine($" 职业名称: {jobName}, 链接: {jobLink}");

                    if (TryParseCareerId(jobLink, out long idValue))
                    {
                        items.Add(new ZProfessionItemDto() { id = idValue, name = jobName });
                    }
                }
            }

            categories.Add(new BaseZProfessionDto() { RootName = title, itemDtos = items });
        }

        return categories;
    }

    /// <summary>
    /// Reads the run of decimal digits that follows the first <c>id=</c> in a link.
    /// </summary>
    /// <param name="link">The href value to inspect.</param>
    /// <param name="id">The parsed id, or 0 when the method returns false.</param>
    /// <returns>
    /// True when <c>id=</c> is present and followed by digits that fit in a
    /// <see cref="long"/>; otherwise false.
    /// </returns>
    private static bool TryParseCareerId(string link, out long id)
    {
        id = 0;
        if (string.IsNullOrEmpty(link))
        {
            return false;
        }

        const string marker = "id=";
        int markerIndex = link.IndexOf(marker, StringComparison.Ordinal);
        if (markerIndex < 0)
        {
            return false;
        }

        // Stop at the first non-digit so trailing text such as "&foo=bar" is ignored.
        int start = markerIndex + marker.Length;
        int end = start;
        while (end < link.Length && link[end] >= '0' && link[end] <= '9')
        {
            end++;
        }

        return long.TryParse(
            link.Substring(start, end - start),
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture,
            out id);
    }
}
|
||||
|
||||
/// <summary>
/// A top-level profession category together with the professions listed under it.
/// </summary>
/// <remarks>
/// The property names are the JSON member names that System.Text.Json matches when
/// the stored list is deserialized, so they must not be renamed.
/// </remarks>
public class BaseZProfessionDto
{
    /// <summary>Title of the category.</summary>
    public string RootName { get; set; }

    /// <summary>
    /// Professions belonging to this category. Starts as an empty list so that a
    /// JSON element without this member does not leave consumers with null.
    /// </summary>
    public List<ZProfessionItemDto> itemDtos { get; set; } = new List<ZProfessionItemDto>();
}
|
||||
|
||||
/// <summary>
/// A single profession entry: its numeric id and its display name.
/// </summary>
/// <remarks>
/// The lower-case property names are the JSON member names that System.Text.Json
/// matches when the stored list is deserialized, so they must not be renamed.
/// </remarks>
public class ZProfessionItemDto
{
    /// <summary>Numeric id taken from the <c>id=</c> part of the profession's link.</summary>
    public long id { get; set; }

    /// <summary>Name of the profession as shown in the scraped list.</summary>
    public string name { get; set; }
}
|
||||
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue