// C# source file (97 lines, 3.6 KiB)
using HtmlAgilityPack;
|
|
using OpenQA.Selenium.Chrome;
|
|
using OpenQA.Selenium;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Net;
|
|
using System.Text;
|
|
using System.Threading.Tasks;
|
|
|
|
namespace New_Spider.阳光高考
|
|
{
|
|
/// <summary>
/// Scraping helpers for two news listings: the sdzk.cn news list, fetched with HtmlAgilityPack,
/// and the gaokao.chsi.com.cn policy list, rendered through headless Chrome via Selenium.
/// </summary>
public class HtmlAgNewsHelper
{
    /// <summary>
    /// Loads the sdzk.cn news list, prints every entry as <c>url|text</c> and then loads each
    /// linked article to parse its title, author, publish time and body HTML.
    /// </summary>
    /// <remarks>
    /// List entries or article pages whose markup does not match the expected XPaths are
    /// skipped (article pages are reported on stderr) instead of aborting the crawl.
    /// Network failures raised by <c>HtmlWeb.Load</c> still propagate to the caller.
    /// </remarks>
    public void HtmlCreatePageData()
    {
        HtmlWeb webClient = new HtmlWeb();
        HtmlDocument listPage = webClient.Load("https://www.sdzk.cn/NewsList.aspx?BCID=2");

        // SelectNodes returns null, not an empty collection, when the XPath matches nothing.
        HtmlNodeCollection categoryNodeList = listPage.DocumentNode.SelectNodes("//*[@id=\"ctl00_ContentPlaceHolder1_ctl00_ContentPlaceHolder1_RadListView1Panel\"]/ul/li");
        if (categoryNodeList == null)
        {
            Console.Error.WriteLine("No news list entries found; the list page layout may have changed.");
            return;
        }

        var listurls = new List<string>();
        foreach (var item in categoryNodeList)
        {
            // Look the anchor up explicitly: the first child of an <li> can be a whitespace
            // text node, which carries no href attribute.
            var link = item.SelectSingleNode(".//a[@href]");
            if (link == null)
            {
                continue;
            }

            string href = link.Attributes["href"].Value;
            string articleUrl = $"https://www.sdzk.cn/{href}";
            Console.WriteLine($"{articleUrl}|{item.InnerText}");
            listurls.Add(articleUrl);
        }

        foreach (string url in listurls)
        {
            HtmlNode article = webClient.Load(url).DocumentNode;

            var titleNode = article.SelectSingleNode("//*[@id=\"form1\"]/div[6]/div[2]/h3");
            var metaNode = article.SelectSingleNode("//*[@id=\"form1\"]/div[6]/div[2]/em");
            var bodyNode = article.SelectSingleNode("//*[@id=\"form1\"]/div[6]/div[2]/div");
            if (titleNode == null || metaNode == null || bodyNode == null)
            {
                Console.Error.WriteLine($"Skipping {url}: expected article markup not found.");
                continue;
            }

            // The <em> line carries both the author and the publish time; read it once.
            string metaText = metaNode.InnerText;

            var inntertitle = titleNode.InnerText;
            var author = SegmentAfter(metaText, "作者:").Split(" ")[0];
            var createtime = SegmentAfter(metaText, "发布时间:");
            var innerhtml = bodyNode.InnerHtml;

            // NOTE(review): the parsed fields above are not stored, returned or printed
            // anywhere yet (same as before this change) - hook persistence in here.
        }
    }

    /// <summary>
    /// Opens the gaokao.chsi.com.cn policy news page in headless Chrome, prints the text of the
    /// <c>span</c> found in each list row, and finally prints the text of the list container.
    /// </summary>
    /// <remarks>
    /// The browser is always shut down, even when a lookup fails. Rows that contain no
    /// <c>span</c> or no <c>a</c> element are skipped.
    /// </remarks>
    public void HtmlSpiderYangGuangData()
    {
        // Initialise the Chrome driver; headless mode plus the usual container-safe flags.
        var options = new ChromeOptions();
        options.AddArgument("--headless");
        options.AddArgument("--no-sandbox");
        options.AddArgument("--disable-dev-shm-usage");

        string driverExecutableFileName = "chromedriver";
        string driverPath = "D:\\一草一木教育\\newgaokao\\New_Spider\\bin\\Debug\\net6.0\\";
        if (!System.IO.Directory.Exists(driverPath))
        {
            // The configured directory only exists on the original development machine;
            // fall back to the directory the application is running from.
            driverPath = AppContext.BaseDirectory;
        }

        ChromeDriverService service = ChromeDriverService.CreateDefaultService(driverPath, driverExecutableFileName);
        var driver = new ChromeDriver(service, options);

        string contentText;
        try
        {
            // Navigate to the target page.
            driver.Navigate().GoToUrl("https://gaokao.chsi.com.cn/news/zszc.do");

            // The list is rendered dynamically: let element lookups poll for up to ten
            // seconds instead of sleeping for a fixed interval.
            driver.Manage().Timeouts().ImplicitWait = TimeSpan.FromSeconds(10);

            // Container holding the dynamic content.
            var content = driver.FindElement(By.XPath("//*[@id=\"app\"]/div[2]/div[2]"));

            // Rows inside the container.
            IReadOnlyCollection<IWebElement> grandChildElements = driver.FindElements(By.XPath("//*[@id=\"app\"]/div[2]/div[2]/div[*]"));

            // The rows exist by now, so a missing child should fail fast rather than
            // wait out the implicit timeout on every row.
            driver.Manage().Timeouts().ImplicitWait = TimeSpan.Zero;

            foreach (IWebElement grandChildElement in grandChildElements)
            {
                var spans = grandChildElement.FindElements(By.TagName("span"));
                var anchors = grandChildElement.FindElements(By.TagName("a"));
                if (spans.Count == 0 || anchors.Count == 0)
                {
                    continue;
                }

                var grandChildText = spans[0].Text;

                // One anchor lookup serves both the title and the link.
                var titletext = anchors[0].Text;
                var aurl = anchors[0].GetAttribute("href");
                // NOTE(review): titletext and aurl are read but not used anywhere yet
                // (same as before this change).

                Console.WriteLine($"Grandchild Element Text: {grandChildText}");
            }

            // Capture the text while the session is alive; the element cannot be read
            // once the driver has quit.
            contentText = content.Text;
        }
        finally
        {
            // Shut the browser and driver process down on every path.
            driver.Quit();
        }

        // Print the dynamic content.
        Console.WriteLine(contentText);
    }

    /// <summary>
    /// Returns the piece of <paramref name="text"/> that follows the first occurrence of
    /// <paramref name="marker"/> (up to the next occurrence, if any), or an empty string when
    /// the marker is absent.
    /// </summary>
    private static string SegmentAfter(string text, string marker)
    {
        string[] parts = text.Split(marker);
        return parts.Length > 1 ? parts[1] : string.Empty;
    }
}
|
|
}
|
|
|
|
|
|
|