Добавьте файлы проекта.
This commit is contained in:
86
Parsing/CatalogPageParser.cs
Normal file
86
Parsing/CatalogPageParser.cs
Normal file
@@ -0,0 +1,86 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using CRAWLER.Models;
|
||||
using HtmlAgilityPack;
|
||||
|
||||
namespace CRAWLER.Parsing;
|
||||
|
||||
/// <summary>
/// Turns the HTML of a catalog listing page into <see cref="CatalogListItem"/> objects.
/// </summary>
internal sealed class CatalogPageParser
{
    /// <summary>
    /// Parses the page markup and produces one item per <c>div</c> whose class list
    /// contains the whole token <c>sivreestre</c>.
    /// </summary>
    /// <param name="html">Page markup; <c>null</c> is loaded as an empty document.</param>
    /// <param name="baseUrl">
    /// Handed to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c> together with every href read from the page.
    /// </param>
    /// <returns>
    /// The items whose registry number or name is not blank, in document order of the
    /// matched nodes; an empty list when no matching <c>div</c> is found.
    /// </returns>
    public IReadOnlyList<CatalogListItem> Parse(string html, string baseUrl)
    {
        var page = new HtmlDocument();
        page.LoadHtml(html ?? string.Empty);

        var result = new List<CatalogListItem>();

        // The concat/normalize-space idiom matches "sivreestre" only as a complete class token,
        // not as a substring of some longer class name.
        var entries = page.DocumentNode.SelectNodes(
            "//div[contains(concat(' ', normalize-space(@class), ' '), ' sivreestre ')]");

        // SelectNodes yields null (not an empty collection) when nothing matches.
        if (entries != null)
        {
            foreach (var entry in entries)
            {
                var parsed = ReadItem(entry, baseUrl);

                // An entry with neither a registry number nor a name is skipped.
                if (string.IsNullOrWhiteSpace(parsed.RegistryNumber) && string.IsNullOrWhiteSpace(parsed.Name))
                {
                    continue;
                }

                result.Add(parsed);
            }
        }

        return result;
    }

    /// <summary>
    /// Builds a single <see cref="CatalogListItem"/> from one catalog entry node.
    /// </summary>
    /// <param name="entry">The entry <c>div</c> whose child blocks and links are read.</param>
    /// <param name="baseUrl">Base address forwarded to the URL readers.</param>
    /// <returns>A new item populated from the labelled blocks and links found under <paramref name="entry"/>.</returns>
    private static CatalogListItem ReadItem(HtmlNode entry, string baseUrl)
    {
        return new CatalogListItem
        {
            RegistryNumber = ReadBlockValue(entry, "№ в госреестре"),
            Name = ReadBlockValue(entry, "Наименование"),
            TypeDesignation = ReadBlockValue(entry, "Тип"),
            Manufacturer = ReadBlockValue(entry, "Производитель"),
            VerificationInterval = ReadBlockValue(entry, "МПИ"),
            // NOTE(review): the two labels below look the same but are separate literals; the first
            // appears to start with a Latin 'C' and the second with a Cyrillic one - keep both.
            // The second lookup runs only when the first one evaluates to null.
            CertificateOrSerialNumber = ReadBlockValue(entry, "Cвидетельство завод. номер")
                ?? ReadBlockValue(entry, "Свидетельство завод. номер"),
            DetailUrl = ReadDetailUrl(entry, baseUrl),
            DescriptionTypePdfUrl = ReadPdfUrl(entry, "/prof/opisanie/", baseUrl),
            MethodologyPdfUrl = ReadPdfUrl(entry, "/prof/metodiki/", baseUrl)
        };
    }

    /// <summary>
    /// Reads the value of the child block labelled <paramref name="header"/>.
    /// </summary>
    /// <param name="root">Entry node whose direct child blocks are searched.</param>
    /// <param name="header">Label to look for.</param>
    /// <returns>
    /// Whatever <c>HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph</c> returns for the
    /// located block; that helper also receives <c>null</c> when no block carries the label.
    /// </returns>
    private static string ReadBlockValue(HtmlNode root, string header) =>
        HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph(FindBlockByHeader(root, header));

    /// <summary>
    /// Finds the first direct child <c>div</c> of <paramref name="root"/> whose direct child
    /// <c>p</c> carries the given label.
    /// </summary>
    /// <param name="root">Node whose direct <c>div</c> children are inspected.</param>
    /// <param name="header">
    /// Label compared, ignoring case (ordinal), with the paragraph text after it has been passed
    /// through <c>HtmlParsingHelpers.NormalizeLabel</c>.
    /// </param>
    /// <returns>The matching block, or <c>null</c> when there are no child divs or none matches.</returns>
    private static HtmlNode FindBlockByHeader(HtmlNode root, string header)
    {
        var candidates = root.SelectNodes("./div");
        if (candidates != null)
        {
            foreach (var candidate in candidates)
            {
                // A block without a direct <p> child contributes a null text to NormalizeLabel.
                var caption = candidate.SelectSingleNode("./p");
                var normalized = HtmlParsingHelpers.NormalizeLabel(caption?.InnerText);

                if (string.Equals(normalized, header, StringComparison.OrdinalIgnoreCase))
                {
                    return candidate;
                }
            }
        }

        return null;
    }

    /// <summary>
    /// Reads the link to the entry's detail page.
    /// </summary>
    /// <param name="root">Entry node searched for the link.</param>
    /// <param name="baseUrl">Base address passed to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c>.</param>
    /// <returns>
    /// The result of <c>HtmlParsingHelpers.MakeAbsoluteUrl</c> for the link's <c>href</c>;
    /// the helper receives <c>null</c> when no link or no <c>href</c> attribute is present.
    /// </returns>
    private static string ReadDetailUrl(HtmlNode root, string baseUrl)
    {
        // First choice: the leading anchor of a "resulttable6" div; otherwise the one in "resulttable4".
        var anchor = root.SelectSingleNode(".//div[contains(@class,'resulttable6')]/a[1]");
        if (anchor == null)
        {
            anchor = root.SelectSingleNode(".//div[contains(@class,'resulttable4')]/a[1]");
        }

        var href = anchor?.GetAttributeValue("href", null);
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, href);
    }

    /// <summary>
    /// Reads the first link under <paramref name="root"/> whose <c>href</c> contains <paramref name="marker"/>.
    /// </summary>
    /// <param name="root">Entry node searched for the link.</param>
    /// <param name="marker">
    /// Substring expected inside the <c>href</c>. It is placed verbatim between single quotes in
    /// the XPath expression, so it must not itself contain a single quote.
    /// </param>
    /// <param name="baseUrl">Base address passed to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c>.</param>
    /// <returns>
    /// The result of <c>HtmlParsingHelpers.MakeAbsoluteUrl</c> for the link's <c>href</c>;
    /// the helper receives <c>null</c> when no such link is present.
    /// </returns>
    private static string ReadPdfUrl(HtmlNode root, string marker, string baseUrl)
    {
        var query = ".//a[contains(@href,'" + marker + "')]";
        var anchor = root.SelectSingleNode(query);
        var href = anchor?.GetAttributeValue("href", null);
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, href);
    }
}
|
||||
Reference in New Issue
Block a user