feat:爬虫数据部分提交

develop
old易 2023-09-14 18:32:40 +08:00
parent 51c8c9a526
commit 9edc8a7aa2
14 changed files with 400 additions and 10 deletions

View File

@ -1,4 +1,5 @@
using Microsoft.AspNetCore.Mvc;
using New_College.Common.Helper;
using New_College.IServices;
using System.Threading.Tasks;
@ -24,6 +25,7 @@ namespace New_College.Controllers
[HttpGet]
// Answers every GET with an empty OK result; no service call is made here.
public IActionResult Get() => Ok();
@ -35,7 +37,9 @@ namespace New_College.Controllers
// Starts one of the t_EnrollmentPlane import/update routines and returns its Task<bool> to the caller
// without awaiting it here.
public Task<bool> Post()
{
// Earlier entry point, left disabled:
// return t_EnrollmentPlane.Import();
// NOTE(review): two live return statements follow, so as written only Importuniverbaseinfo() runs and
// the universitydetailupdate() call below is unreachable. Presumably the first return is the pre-change
// side of this diff hunk and only the last one exists in the real file — confirm before relying on it.
return t_EnrollmentPlane.Importuniverbaseinfo();
// return t_EnrollmentPlane.Importuniverbaseinfo();
return t_EnrollmentPlane.universitydetailupdate();
}
}

View File

@ -527,7 +527,7 @@
</member>
<member name="P:New_College.Model.Models.D_University.Nature">
<summary>
办学性质 1、公办2、民办
办学性质 1、公办2、民办3中外合作,4 港澳台
</summary>
</member>
<member name="P:New_College.Model.Models.D_University.Ascription">
@ -4799,6 +4799,11 @@
是否双一流
</summary>
</member>
<member name="P:New_College.Model.ViewModels.UniversityResult.QJJH">
<summary>
强基
</summary>
</member>
<member name="P:New_College.Model.ViewModels.UniversityResult.Logo">
<summary>
学校logo

View File

@ -0,0 +1,88 @@
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace New_College.Common.Helper
{
public static class UniversityTypeRelsove
{
    // Display names indexed by university type code (index 0 .. 12).
    private static readonly string[] TypeNames =
    {
        "综合类",
        "理工类",
        "医学类",
        "军事类",
        "语言类",
        "师范类",
        "财经类",
        "政法类",
        "民族类",
        "农林类",
        "艺术类",
        "体育类",
        "其他",
    };

    /// <summary>
    /// Converts a numeric university type code into its display name.
    /// Codes: 0 comprehensive, 1 science and engineering, 2 medical, 3 military, 4 language,
    /// 5 teacher training, 6 finance and economics, 7 politics and law, 8 ethnic,
    /// 9 agriculture and forestry, 10 arts, 11 sports, 12 other.
    /// </summary>
    /// <param name="type">The type code to translate.</param>
    /// <returns>The display name, or an empty string when the code is outside 0..12.</returns>
    public static string GetType(int type)
    {
        bool known = type >= 0 && type < TypeNames.Length;
        return known ? TypeNames[type] : string.Empty;
    }

    /// <summary>
    /// Reads an entire file as UTF-8 text.
    /// </summary>
    /// <param name="filepath">Path of the file to read.</param>
    /// <returns>The complete file content.</returns>
    public static string GetFileJson(string filepath)
    {
        // Opened read-only while still allowing other handles to read and write the file.
        using (var stream = new FileStream(filepath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        using (var reader = new StreamReader(stream, Encoding.GetEncoding("utf-8")))
        {
            return reader.ReadToEnd();
        }
    }
}
}

View File

@ -14,7 +14,7 @@ namespace New_College.IServices
public interface IT_EnrollmentPlaneServices : IBaseServices<T_EnrollmentPlane>
{
Task<bool> Import();
Task<bool> universitydetailupdate();
Task<bool> Importuniverbaseinfo();
/// <summary>
/// 根据大学或者专业获取招生计划

View File

@ -35,7 +35,7 @@ namespace New_College.Model.Models
public string Build_Date { get; set; }
/// <summary>
/// School ownership type: 1 = public, 2 = private, 3 = Sino-foreign cooperative, 4 = Hong Kong / Macao / Taiwan.
/// NOTE(review): the university list mapping in the services layer appears to read 0 as public,
/// 1 as private and 2 as Sino-foreign cooperative — confirm which numbering is actually stored.
/// </summary>
[SugarColumn(IsNullable = true)]
public int Nature { get; set; }

View File

@ -30,6 +30,11 @@ namespace New_College.Model.ViewModels
/// 是否双一流
/// </summary>
public bool? Syl { get; set; }
/// <summary>
/// Strong Foundation Plan (强基) flag.
/// </summary>
public bool? QJJH { get; set; }
/// <summary>
/// 学校logo
/// </summary>

View File

@ -304,6 +304,7 @@ namespace New_College.Services
Syl = university.Syl == 1 ? true : false,
Nhef = university.Nhef == 1 ? true : false,
Sff = university.Sff == 1 ? true : false,
QJJH = university.QJJH == 1 ? true : false,
UniversityType = university.Type,
SubjectLevel = university.Subject_Level,
Imglist = imgs,

View File

@ -9,6 +9,7 @@ using System.Threading.Tasks;
using System.Collections.Generic;
using New_College.IRepository;
using System.Linq;
using New_College.Common.Helper;
namespace New_College.Services
{
@ -161,14 +162,14 @@ namespace New_College.Services
{
Id = c.Id,
Name = c.Name,
Nature = c.Nature == 1 ? "公立" : "私立",
Nature = c.Nature == 0 ? "公办" : c.Nature == 1 ? "民办" : c.Nature == 2 ? "中外合作" : "港澳台合作",
AscriptionName = string.IsNullOrWhiteSpace(c.AscriptionName) ? "-" : c.AscriptionName,
Rank = c.Rank,
AreaName = c.Area_Name,
Syl = c.Syl == 1 ? "是" : "否",
Nhef = c.Nhef == 1 ? "是" : "否",
Sff = c.Sff == 1 ? "是" : "否",
UniversityType = c.Type == 0 ? "综合" : c.Type == 1 ? "理工" : "医学类",
UniversityType = UniversityTypeRelsove.GetType(c.Type),
BuildDate = c.Build_Date,
SubjectLevel = c.Subject_Level == 1 ? "本科" : c.Subject_Level == 2 ? "专科" : "-",
AcademicianCount = c.Academician_Count <= 0 ? "-" : c.Academician_Count.ToString(),

View File

@ -15,6 +15,10 @@ using New_College.Model.ViewModels.Result;
using New_College.Model.ViewModels.Query;
using New_College.Model;
using System.Text.RegularExpressions;
using New_College.Common.Helper;
using Newtonsoft.Json;
using System.IO;
using System.Threading;
namespace New_College.Services
{
@ -51,6 +55,54 @@ namespace New_College.Services
/// <summary>
/// Fills in blank university descriptions from the JSON dump files 0.json, 20.json, ... 2800.json
/// under D:\\jsondoc\\2023-09-14.
/// </summary>
/// <remarks>
/// Every database update is awaited in turn, so the method completes only after all writes have
/// finished and a failing update propagates to the caller instead of being lost in an
/// un-awaited async lambda.
/// </remarks>
/// <returns><c>true</c> once every file has been processed.</returns>
public async Task<bool> universitydetailupdate()
{
    var universitylist = await this.d_University.Query(x => x.IsDelete == false);

    // One lookup table instead of scanning the whole list for every entry. The first row per
    // name wins, which is the row FirstOrDefault would have picked.
    var universitiesByName = universitylist
        .Where(u => u.Name != null)
        .GroupBy(u => u.Name)
        .ToDictionary(g => g.Key, g => g.First());

    for (int k = 0; k <= 2800; k += 20)
    {
        var jsontext = UniversityTypeRelsove.GetFileJson(string.Format(@"D:\\jsondoc\\2023-09-14\\{0}.json", k));
        foreach (var entry in ParseUniversityDump(jsontext))
        {
            // Same 100 ms pause per entry as before, but without blocking a thread.
            await Task.Delay(100);

            if (entry == null || entry.name == null || string.IsNullOrWhiteSpace(entry.description))
            {
                continue;
            }

            if (!universitiesByName.TryGetValue(entry.name.Trim(), out var university))
            {
                continue;
            }

            // Existing descriptions are never overwritten; only blank ones are filled in.
            if (!string.IsNullOrWhiteSpace(university.Description))
            {
                continue;
            }

            university.Description = entry.description;
            university.ModifyTime = DateTime.Now;
            await this.d_University.Update(university);
        }
    }

    return true;
}

/// <summary>
/// Parses the content of one dump file into its entries.
/// </summary>
/// <param name="jsontext">Raw file content.</param>
/// <returns>The parsed entries; an empty list when the content is blank or decodes to null.</returns>
private static List<Class1> ParseUniversityDump(string jsontext)
{
    if (string.IsNullOrWhiteSpace(jsontext))
    {
        return new List<Class1>();
    }

    var payload = jsontext.Trim();

    // The files hold the array wrapped in a JSON string literal (quoted and escaped). Let the JSON
    // parser undo that wrapping rather than stripping quotes by hand and running Regex.Unescape,
    // which is not a JSON decoder. A bare array is accepted as well.
    if (payload[0] == '"')
    {
        payload = JsonConvert.DeserializeObject<string>(payload);
        if (string.IsNullOrWhiteSpace(payload))
        {
            return new List<Class1>();
        }
    }

    return JsonConvert.DeserializeObject<List<Class1>>(payload) ?? new List<Class1>();
}
// One entry of a dump file; universitydetailupdate deserializes each file into a List<Class1>.
// The lower-case property names are the JSON field names.
public class Class1
{
// Entry identifier; universitydetailupdate does not read it.
public string id { get; set; }
// University name; compared (after Trim) with the stored university Name.
public string name { get; set; }
// Description text copied into the university's Description when that field is blank.
public string description { get; set; }
}
/// <summary>
///
/// </summary>
@ -97,7 +149,7 @@ namespace New_College.Services
single.Syl = issyl == "双一流" ? 1 : 0;
single.Subject_Level = schooltype == "本科" ? 0 : 1;
single.ModifyTime = DateTime.Now;
await this.d_University.Update(single);
await this.d_University.Update(single);
}
else

View File

@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30114.105
# Visual Studio Version 17
VisualStudioVersion = 17.5.33627.172
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "New_College.Api", "New_College.Api\New_College.Api.csproj", "{6F47A41A-085E-4422-BB73-5A2CBAA07D9F}"
EndProject
@ -27,6 +27,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "New_College.ConsoleApp", "N
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "New_College.AdminMvc", "New_College.AdminMvc\New_College.AdminMvc.csproj", "{06D885F3-6352-4BF6-B826-DEA742DFFBD7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "New_Spider", "New_Spider\New_Spider.csproj", "{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@ -81,6 +83,10 @@ Global
{06D885F3-6352-4BF6-B826-DEA742DFFBD7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{06D885F3-6352-4BF6-B826-DEA742DFFBD7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{06D885F3-6352-4BF6-B826-DEA742DFFBD7}.Release|Any CPU.Build.0 = Release|Any CPU
{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E23857BF-DFBB-49DD-A86A-1B2932F6D33B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

172
New_Spider/HtmlAgHelper.cs Normal file
View File

@ -0,0 +1,172 @@
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
namespace New_Spider
{
public class HtmlAgHelper
{
    // URL templates and XPath expressions used by every method in this class.
    private const string ListPageUrlFormat = "https://gaokao.chsi.com.cn/sch/search--ss-on,option-qg,searchType-1,start-{0}.dhtml";
    private const string SiteUrlFormat = "https://gaokao.chsi.com.cn/{0}";
    private const string SchoolRowsXPath = "//*[@id=\"app-yxk-sch-list\"]/div[1]/div[*]";
    // Evaluated relative to one row, so the link always belongs to the row being processed.
    private const string SchoolLinkXPath = "./div[1]/div/a";
    private const string IntroLinkXPath = "/html/body/div[1]/div[3]/div/a[2]";
    private const string DescriptionXPath = "/html/body/div[1]/div[4]/div[3]";

    /// <summary>
    /// Crawls the school list pages, follows each school to its description page and writes one
    /// JSON file per list page through <c>ITextGen.GenJson</c>.
    /// </summary>
    /// <param name="startFrom">First value of the list page <c>start</c> offset.</param>
    /// <param name="startTo">
    /// Last offset (inclusive). The default of 1 keeps the previous hard-coded behaviour of
    /// fetching only the first page; the loop that was commented out ran to 2800.
    /// </param>
    /// <param name="pageStep">Increment between offsets; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="pageStep"/> is not positive.</exception>
    public void HtmlCreatePageData(int startFrom = 0, int startTo = 1, int pageStep = 20)
    {
        if (pageStep <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(pageStep), "pageStep must be positive.");
        }

        HtmlWeb webClient = new HtmlWeb();
        ITextGen textgen = new ITextGen();
        for (int k = startFrom; k <= startTo; k += pageStep)
        {
            var jsonobjs = new List<JsonObj>();
            HtmlDocument doc = webClient.Load(string.Format(ListPageUrlFormat, k));
            foreach (var link in FindSchoolLinks(doc))
            {
                // Pause between schools before issuing the next requests.
                Thread.Sleep(500);
                Console.WriteLine(string.Format("item: a:https://gaokao.chsi.com.cn/{0};name:{1}", link.GetAttributeValue("href", string.Empty), link.InnerText));
                HtmlUniversityAgResolve(doc, webClient, link, link.InnerText.Trim(), jsonobjs);
            }

            Thread.Sleep(1000);
            textgen.GenJson(Newtonsoft.Json.JsonConvert.SerializeObject(jsonobjs), k.ToString());
        }
    }

    // One crawled school; the lower-case property names are the JSON field names.
    public class JsonObj
    {
        public string id { get; set; }
        public string name { get; set; }
        public string description { get; set; }
    }

    /// <summary>
    /// Loads a school's page and resolves the description behind each link matched on it.
    /// </summary>
    /// <param name="htmldoc">Not read; kept for signature compatibility.</param>
    /// <param name="htmlWeb">Client used to download pages.</param>
    /// <param name="htmlNode">Anchor whose <c>href</c> points at the school's page.</param>
    /// <param name="name">School name stored with every collected description.</param>
    /// <param name="jsons">List that receives the collected entries.</param>
    public void HtmlUniversityAgResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, HtmlNode htmlNode, string name, List<JsonObj> jsons)
    {
        if (htmlWeb == null)
        {
            throw new ArgumentNullException(nameof(htmlWeb));
        }
        if (htmlNode == null)
        {
            throw new ArgumentNullException(nameof(htmlNode));
        }
        if (jsons == null)
        {
            throw new ArgumentNullException(nameof(jsons));
        }

        var schoolHref = htmlNode.GetAttributeValue("href", string.Empty);
        if (string.IsNullOrEmpty(schoolHref))
        {
            // Nothing to follow for an anchor without a target.
            return;
        }

        var schoolPage = htmlWeb.Load(string.Format(SiteUrlFormat, schoolHref));
        // SelectNodes returns null, not an empty collection, when nothing matches.
        HtmlNodeCollection introLinks = schoolPage.DocumentNode.SelectNodes(IntroLinkXPath);
        if (introLinks == null)
        {
            return;
        }

        foreach (var introLink in introLinks)
        {
            Thread.Sleep(500);
            try
            {
                HtmlUniversityDescriptionResolve(schoolPage, htmlWeb, introLink.GetAttributeValue("href", string.Empty), name, jsons);
            }
            catch (Exception e)
            {
                // Best effort: one failing school must not stop the crawl, but the failure is reported.
                Console.Error.WriteLine(string.Format("description fetch failed for {0}: {1}", name, e.Message));
            }
        }
    }

    /// <summary>
    /// Loads a description page and appends one entry per matched description node.
    /// </summary>
    /// <param name="htmldoc">Not read; kept for signature compatibility.</param>
    /// <param name="htmlWeb">Client used to download the page.</param>
    /// <param name="url">Site-relative address of the description page.</param>
    /// <param name="name">School name stored with every entry.</param>
    /// <param name="jsons">List that receives the entries.</param>
    public void HtmlUniversityDescriptionResolve(HtmlDocument htmldoc, HtmlWeb htmlWeb, string url, string name, List<JsonObj> jsons)
    {
        if (htmlWeb == null)
        {
            throw new ArgumentNullException(nameof(htmlWeb));
        }
        if (jsons == null)
        {
            throw new ArgumentNullException(nameof(jsons));
        }

        var descriptionPage = htmlWeb.Load(string.Format(SiteUrlFormat, url));
        HtmlNodeCollection descriptionNodes = descriptionPage.DocumentNode.SelectNodes(DescriptionXPath);
        if (descriptionNodes == null)
        {
            return;
        }

        foreach (var descriptionNode in descriptionNodes)
        {
            jsons.Add(new JsonObj()
            {
                description = descriptionNode.InnerText,
                name = name,
                // Year-month-day-hour order; the earlier pattern had the hour between month and day.
                id = DateTime.Now.ToString("yyyyMMddHHmmssfff"),
            });
        }
    }

    /// <summary>
    /// Walks the first list page and each school's description page without collecting anything;
    /// the only output is one console line per school.
    /// </summary>
    public void HtmlAg()
    {
        HtmlWeb webClient = new HtmlWeb();
        HtmlDocument listPage = webClient.Load(string.Format(ListPageUrlFormat, 0));
        foreach (var link in FindSchoolLinks(listPage))
        {
            var schoolHref = link.GetAttributeValue("href", string.Empty);
            Console.WriteLine(string.Format("item: a:https://gaokao.chsi.com.cn/{0};name:{1}", schoolHref, link.InnerText));

            var schoolPage = webClient.Load(string.Format(SiteUrlFormat, schoolHref));
            HtmlNodeCollection introLinks = schoolPage.DocumentNode.SelectNodes(IntroLinkXPath);
            if (introLinks == null)
            {
                continue;
            }

            foreach (var introLink in introLinks)
            {
                var introHref = introLink.GetAttributeValue("href", string.Empty);
                if (string.IsNullOrEmpty(introHref))
                {
                    continue;
                }

                // The page is still downloaded; its description text was never used by this method.
                webClient.Load(string.Format(SiteUrlFormat, introHref));
            }
        }
    }

    // Returns the school anchors of a list page that carry an href, in document order.
    private static List<HtmlNode> FindSchoolLinks(HtmlDocument listPage)
    {
        var links = new List<HtmlNode>();
        HtmlNodeCollection rows = listPage.DocumentNode.SelectNodes(SchoolRowsXPath);
        if (rows == null)
        {
            return links;
        }

        foreach (var row in rows)
        {
            var link = row.SelectSingleNode(SchoolLinkXPath);
            if (link != null && !string.IsNullOrEmpty(link.GetAttributeValue("href", string.Empty)))
            {
                links.Add(link);
            }
        }

        return links;
    }
}
}

28
New_Spider/ITextGen.cs Normal file
View File

@ -0,0 +1,28 @@
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Security.AccessControl;
using System.Text;
using System.Threading.Tasks;
using System.Xml;
namespace New_Spider
{
public class ITextGen
{
    /// <summary>
    /// Writes a JSON payload to D:\\jsondoc\\2023-09-14\\{Ids}.json, creating the folder when it is missing.
    /// </summary>
    /// <param name="jsons">Text to store; it is serialized again, so the file holds a JSON string literal.</param>
    /// <param name="Ids">File name without extension.</param>
    /// <exception cref="ArgumentException"><paramref name="Ids"/> is null, empty or whitespace.</exception>
    public void GenJson(string jsons, string Ids)
    {
        if (string.IsNullOrWhiteSpace(Ids))
        {
            throw new ArgumentException("A file name is required.", nameof(Ids));
        }

        // Serializing a string produces a quoted, escaped JSON string; Formatting.Indented has no
        // effect on it.
        // NOTE(review): the importer in New_College.Services strips the surrounding quotes and
        // unescapes the content, so the file format is kept as is — change both sides together.
        string jsonString = JsonConvert.SerializeObject(jsons, Newtonsoft.Json.Formatting.Indented);

        string path = string.Format(@"D:\\jsondoc\\2023-09-14\\{0}.json", Ids);

        // File.WriteAllText does not create missing folders.
        var directory = Path.GetDirectoryName(path);
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }

        File.WriteAllText(path, jsonString);
    }
}
}

View File

@ -0,0 +1,15 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="HtmlAgilityPack" Version="1.11.53" />
<PackageReference Include="Newtonsoft.Json" Version="12.0.3" />
</ItemGroup>
</Project>

13
New_Spider/Program.cs Normal file
View File

@ -0,0 +1,13 @@

using HtmlAgilityPack;
using New_Spider;
using System.Text.RegularExpressions;
// Run the crawl once with the helper's default settings.
var crawler = new HtmlAgHelper();
crawler.HtmlCreatePageData();

// Block until a character is available on standard input, then print the template greeting.
_ = Console.Read();
Console.WriteLine("Hello, World!");