bug fixed
parent
86eb0ff747
commit
f7d6c89b7a
|
|
@ -36,10 +36,10 @@ namespace New_College.Tasks
|
||||||
|
|
||||||
private async void DoWork(object state)
|
private async void DoWork(object state)
|
||||||
{
|
{
|
||||||
// BatchListAgHelper batchListAgHelper = new BatchListAgHelper();
|
// BatchListAgHelper batchListAgHelper = new BatchListAgHelper();
|
||||||
NationWideNewsAgHelper anghelper = new NationWideNewsAgHelper();
|
NationWideNewsAgHelper anghelper = new NationWideNewsAgHelper();
|
||||||
var provineInfo = await this.sysRegionServices.Query(e => e.Level == 1 && !e.RegionCode.Contains("-"));
|
var provineInfo = await this.sysRegionServices.Query(e => e.Level == 1 && !e.RegionCode.Contains("-"));
|
||||||
provineInfo.ForEach(p =>
|
provineInfo.ForEach(async p =>
|
||||||
{
|
{
|
||||||
//var years = new List<string>() { "2024", "2023", "2022" };
|
//var years = new List<string>() { "2024", "2023", "2022" };
|
||||||
//years.ForEach(y =>
|
//years.ForEach(y =>
|
||||||
|
|
@ -48,7 +48,7 @@ namespace New_College.Tasks
|
||||||
//});
|
//});
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var list = anghelper.HtmlCreatePageData(p.RegionCode);
|
var list = await anghelper.HtmlCreatePageData(p.RegionCode);
|
||||||
list.ForEach(async c =>
|
list.ForEach(async c =>
|
||||||
{
|
{
|
||||||
c.title = c.title.Replace($"{p.SimpleName}:", "");
|
c.title = c.title.Replace($"{p.SimpleName}:", "");
|
||||||
|
|
@ -68,7 +68,7 @@ namespace New_College.Tasks
|
||||||
OrderSort = 0,
|
OrderSort = 0,
|
||||||
IsDelete = false,
|
IsDelete = false,
|
||||||
Title = c.title,
|
Title = c.title,
|
||||||
Summary = HtmlHelper.ReplaceHtmlTag(c.detail).Length > 200 ? HtmlHelper.ReplaceHtmlTag(c.detail).Substring(0, 200) : HtmlHelper.ReplaceHtmlTag(c.detail)
|
Summary = c.summary
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -54,6 +54,7 @@ namespace New_College.Tasks
|
||||||
{
|
{
|
||||||
public string title { get; set; }
|
public string title { get; set; }
|
||||||
public string author { get; set; }
|
public string author { get; set; }
|
||||||
|
public string summary { get; set; }
|
||||||
public DateTime pubtime { get; set; }
|
public DateTime pubtime { get; set; }
|
||||||
public string detail { get; set; }
|
public string detail { get; set; }
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -18,81 +18,212 @@ namespace New_College.Tasks
|
||||||
{
|
{
|
||||||
|
|
||||||
private string itemUrl = "https://gaokao.chsi.com.cn";
|
private string itemUrl = "https://gaokao.chsi.com.cn";
|
||||||
public List<NewsModels> HtmlCreatePageData(string provinceCode)
|
public async Task<List<NewsModels>> HtmlCreatePageData(string provinceCode)
|
||||||
{
|
{
|
||||||
|
|
||||||
HtmlWeb webClient = new HtmlWeb();
|
|
||||||
|
|
||||||
var apiUrl = "https://gaokao.chsi.com.cn/wap/news/search/5018267?ps=20&ss=";
|
|
||||||
var list = new List<NewsModels>();
|
var list = new List<NewsModels>();
|
||||||
var baseUrl = $"{apiUrl}{provinceCode.Replace("0000", "")}&_t={DateTimeOffset.Now.ToUnixTimeSeconds()}";
|
|
||||||
using var httpClient = new HttpClient();
|
switch (provinceCode)
|
||||||
var response = httpClient.GetAsync(baseUrl).Result;
|
|
||||||
if (response.IsSuccessStatusCode)
|
|
||||||
{
|
{
|
||||||
var jsonData = response.Content.ReadAsStringAsync().Result;
|
case "310000":
|
||||||
Console.WriteLine(jsonData);
|
list=await ShangHaiNewsList();
|
||||||
var resultlist = JsonSerializer.Deserialize<GaokaoObject>(jsonData);
|
break;
|
||||||
if (resultlist.msg.Any())
|
case "370000":
|
||||||
|
list= ShanDongNewsList();
|
||||||
|
break;
|
||||||
|
|
||||||
|
}
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private async Task<List<NewsModels>> ShangHaiNewsList()
|
||||||
|
{
|
||||||
|
string BaseUrl = "https://www.shmeea.edu.cn";
|
||||||
|
// var web = new HtmlWeb();
|
||||||
|
var list = new List<NewsModels>();
|
||||||
|
var listUrl = $"{BaseUrl}/page/02200/index.html";
|
||||||
|
using var client = new HttpClient();
|
||||||
|
var bytes = await client.GetByteArrayAsync(listUrl);
|
||||||
|
var html = Encoding.UTF8.GetString(bytes); // ✅ 关键:手动用 UTF-8 解码
|
||||||
|
|
||||||
|
// ✅ 正确方式:用 HtmlDocument.LoadHtml 解析字符串
|
||||||
|
var listDoc = new HtmlDocument();
|
||||||
|
listDoc.LoadHtml(html);
|
||||||
|
|
||||||
|
// 核心修改:class 名模糊匹配 + 节点层级简化
|
||||||
|
var items = listDoc.DocumentNode.SelectNodes("//ul[contains(@class,'pageList')]/li");
|
||||||
|
if (items == null)
|
||||||
|
{
|
||||||
|
Console.WriteLine("❌ 页面列表未找到,可能被反爬或类名变动!");
|
||||||
|
File.WriteAllText("debug.html", listDoc.DocumentNode.OuterHtml); // 调试用
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
foreach (var li in items)
|
||||||
|
{
|
||||||
|
try
|
||||||
{
|
{
|
||||||
try
|
var aTag = li.SelectSingleNode("./a");
|
||||||
|
var span = li.SelectSingleNode("./span[@class='listTime']");
|
||||||
|
|
||||||
|
var href = aTag.GetAttributeValue("href", "");
|
||||||
|
var fullUrl = href.StartsWith("http") ? href : $"{BaseUrl}{href}";
|
||||||
|
var title = aTag.GetAttributeValue("title", "")?.Trim();
|
||||||
|
var date = span?.InnerText?.Trim();
|
||||||
|
|
||||||
|
// PDF 直接跳过正文抓取
|
||||||
|
if (href.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
|
||||||
{
|
{
|
||||||
resultlist.msg.ForEach(o =>
|
list.Add(new NewsModels
|
||||||
{
|
{
|
||||||
Thread.Sleep(100);
|
title = title,
|
||||||
string newsUrl = $"{itemUrl}{o.uri}";
|
author = "上海市教育考试院",
|
||||||
HtmlDocument doc = webClient.Load(newsUrl);
|
pubtime = Convert.ToDateTime(date),
|
||||||
var inntertitle = doc.DocumentNode.SelectSingleNode("//*[@id=\"app\"]/div[2]/h2").InnerText;
|
summary = "该条为 PDF 附件,请点击查看。",
|
||||||
var author = doc.DocumentNode.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[2]").InnerText.Replace("来源:", "");
|
detail = $"<p><a href=\"{fullUrl}\" target=\"_blank\">点击下载附件</a></p>"
|
||||||
var createtime = doc.DocumentNode.SelectSingleNode("//*[@id=\"app\"]/div[3]/div[1]/span[1]").InnerText;
|
|
||||||
foreach (var linkNode in doc.DocumentNode.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null))
|
|
||||||
{
|
|
||||||
string href = linkNode.GetAttributeValue("href", "");
|
|
||||||
if (!string.IsNullOrEmpty(href) && href.StartsWith("/"))
|
|
||||||
{
|
|
||||||
string fullUrl = "https://gaokao.chsi.com.cn" + href;
|
|
||||||
linkNode.SetAttributeValue("href", fullUrl);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
var innerhtml = doc.DocumentNode.SelectSingleNode("//*[@id=\"article_dnull\"]").InnerHtml;
|
|
||||||
//if (!HtmlHelper.ReplaceHtmlTag(innerhtml).Contains("浏览器"))
|
|
||||||
//{
|
|
||||||
|
|
||||||
if (innerhtml.Contains(".pdf") || innerhtml.Contains(".doc") || innerhtml.Contains(".docx") || innerhtml.Contains(".xls") || innerhtml.Contains(".xlsx"))
|
|
||||||
{
|
|
||||||
innerhtml = $"{innerhtml}\n若有附件详情,请至本省招生考试院下载附件!!!";
|
|
||||||
}
|
|
||||||
|
|
||||||
list.Add(new NewsModels()
|
|
||||||
{
|
|
||||||
title = inntertitle,
|
|
||||||
author = author,
|
|
||||||
pubtime = Convert.ToDateTime(createtime),
|
|
||||||
detail = Regex.Replace(innerhtml, @"src=""(?!https?:\/\/)(.*?)""", @"src=""https://gaokao.chsi.com.cn$1""")
|
|
||||||
});
|
|
||||||
// }
|
|
||||||
});
|
});
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
|
||||||
|
// ✅ 正确方式:用 HtmlDocument.LoadHtml 解析字符串
|
||||||
|
|
||||||
|
// 加载详情页:同样用 HttpClient + UTF-8 解码
|
||||||
|
var detailBytes = await client.GetByteArrayAsync(fullUrl);
|
||||||
|
var detailHtml = Encoding.UTF8.GetString(detailBytes);
|
||||||
|
var detailDoc = new HtmlDocument();
|
||||||
|
detailDoc.LoadHtml(detailHtml);
|
||||||
|
|
||||||
|
var model = new NewsModels
|
||||||
{
|
{
|
||||||
Console.WriteLine(ex.Message);
|
author = "上海市教育考试院",
|
||||||
|
pubtime = DateTime.TryParse(date, out var pubTime) ? pubTime : DateTime.Now
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// 标题(剔除 small)
|
||||||
|
var titleNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_title']");
|
||||||
|
if (titleNode != null)
|
||||||
|
{
|
||||||
|
var small = titleNode.SelectSingleNode("./small");
|
||||||
|
if (small != null) small.Remove();
|
||||||
|
model.title = titleNode.InnerText.Trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 正文
|
||||||
|
var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='ivs_content']");
|
||||||
|
model.detail = contentNode?.InnerHtml?.Trim() ?? "";
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// 附件追加到正文
|
||||||
|
var links = contentNode?.SelectNodes(".//a[@href]");
|
||||||
|
if (links != null)
|
||||||
|
{
|
||||||
|
foreach (var link in links)
|
||||||
|
{
|
||||||
|
var hrefVal = link.GetAttributeValue("href", "");
|
||||||
|
if (Regex.IsMatch(hrefVal, @"\.(pdf|mp4|docx?|xls)$", RegexOptions.IgnoreCase))
|
||||||
|
{
|
||||||
|
var fullLink = hrefVal.StartsWith("http") ? hrefVal : $"{BaseUrl}{hrefVal}";
|
||||||
|
model.detail += $"<p>附件:<a href=\"{fullLink}\" target=\"_blank\">{link.InnerText}</a></p>";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 摘要
|
||||||
|
var plainText = Regex.Replace(model.detail, "<.*?>", "");
|
||||||
|
model.summary = plainText.Length > 120 ? plainText.Substring(0, 120) : plainText;
|
||||||
|
list.Add(model);
|
||||||
|
|
||||||
|
Thread.Sleep(200); // 防止请求过快被封
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
Console.WriteLine($"❌ 抓取失败:{ex.Message}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
Console.WriteLine($"Failed to fetch page {baseUrl}, Status Code: {response.StatusCode}");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return list;
|
return list;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Scrapes the news list page of www.sdzk.cn and returns one
/// <see cref="NewsModels"/> per list entry whose detail page could be processed.
/// </summary>
/// <remarks>
/// Runs synchronously: one request for the list page, then one request per entry,
/// pausing 200 ms after each successfully processed entry. A failure on a single
/// entry is written to the console and that entry is skipped; a failure while
/// loading the list page itself propagates to the caller.
/// </remarks>
/// <returns>The scraped items; an empty list when the list markup is not found.</returns>
private List<NewsModels> ShanDongNewsList()
{
    // Relative links are resolved through Uri against the site root, so root-relative,
    // document-relative and absolute hrefs all work and no "//" is produced by
    // manual string concatenation.
    var siteRoot = new Uri("https://www.sdzk.cn/");
    var listUrl = "https://www.sdzk.cn/NewsList.aspx?BCID=20&CID=1117";

    var web = new HtmlWeb();
    var doc = web.Load(listUrl);
    var newsList = new List<NewsModels>();

    var listItems = doc.DocumentNode.SelectNodes("//div[@class='blockLine']//ul[@class='bd']/li/a");
    if (listItems == null) return newsList;

    foreach (var item in listItems)
    {
        try
        {
            // Basic fields. An anchor without a usable href has no detail page to
            // fetch, so skip it instead of downloading the site root as an article.
            var href = item.GetAttributeValue("href", "").Trim();
            if (href.Length == 0 || !Uri.TryCreate(siteRoot, href, out var detailUri))
            {
                continue;
            }

            // Publication date: looked up as <i>yyyy-MM-dd</i> inside the anchor.
            var dateMatch = Regex.Match(item.InnerHtml, @"<i>(\d{4}-\d{2}-\d{2})<\/i>");

            // InnerText also contains the text of that <i> element, so remove the
            // date from the title when it was found.
            var title = item.InnerText?.Trim() ?? "";
            if (dateMatch.Success)
            {
                title = title.Replace(dateMatch.Groups[1].Value, "").Trim();
            }

            // Fall back to the current time when the date is missing or unparsable
            // (same fallback as ShangHaiNewsList) rather than throwing and losing
            // the whole item.
            var pubtime = dateMatch.Success && DateTime.TryParse(dateMatch.Groups[1].Value, out var parsedDate)
                ? parsedDate
                : DateTime.Now;

            // Load the detail page and take the article body.
            var detailDoc = web.Load(detailUri.AbsoluteUri);
            var contentNode = detailDoc.DocumentNode.SelectSingleNode("//div[@id='UCAP-CONTENT']");
            var detailHtml = contentNode?.InnerHtml?.Trim() ?? "";

            // Author: first info span whose text starts with the author label.
            string author = "未知作者";
            var infoSpans = detailDoc.DocumentNode.SelectNodes("//div[@class='MLspan']/span");
            if (infoSpans != null)
            {
                foreach (var span in infoSpans)
                {
                    var text = span.InnerText.Trim();
                    if (text.StartsWith("作者:"))
                    {
                        author = text.Replace("作者:", "").Trim();
                        break;
                    }
                }
            }

            // Summary: body with tags stripped, capped at 120 characters.
            var plainText = Regex.Replace(detailHtml, "<.*?>", "");
            var summary = plainText.Length > 120 ? plainText.Substring(0, 120) : plainText;

            newsList.Add(new NewsModels
            {
                title = title,
                author = author,
                summary = summary,
                detail = detailHtml,
                pubtime = pubtime
            });

            Thread.Sleep(200); // throttle requests to the remote site
        }
        catch (Exception ex)
        {
            Console.WriteLine($"处理失败:{ex.Message}");
        }
    }

    return newsList;
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue