Files
CROWLER/Parsing/CatalogPageParser.cs
2026-04-04 10:52:30 +03:00

87 lines
3.1 KiB
C#
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System;
using System.Collections.Generic;
using CRAWLER.Models;
using HtmlAgilityPack;
namespace CRAWLER.Parsing;
/// <summary>
/// Parses the HTML of a catalog listing page into <see cref="CatalogListItem"/> entries.
/// </summary>
/// <remarks>
/// An entry is any <c>div</c> whose class attribute contains the token <c>sivreestre</c>.
/// Inside an entry, every direct child <c>div</c> is treated as one field block whose
/// label is the text of its direct child <c>p</c> element.
/// </remarks>
internal sealed class CatalogPageParser
{
    // Selects every div whose class attribute contains the exact token "sivreestre".
    private const string CatalogEntryXPath =
        "//div[contains(concat(' ', normalize-space(@class), ' '), ' sivreestre ')]";

    // Field labels as they are compared (case-insensitively) against the normalized <p> text.
    private const string RegistryNumberHeader = "№ в госреестре";
    private const string NameHeader = "Наименование";
    private const string TypeDesignationHeader = "Тип";
    private const string ManufacturerHeader = "Производитель";
    private const string VerificationIntervalHeader = "МПИ";

    // The certificate label is looked up under two spellings that render identically:
    // the first starts with LATIN CAPITAL LETTER C (U+0043), the second with
    // CYRILLIC CAPITAL LETTER ES (U+0421). The first character is written as an escape
    // so the difference stays visible and cannot be lost by retyping the literal.
    // NOTE(review): presumably the source pages use both spellings - confirm.
    private const string CertificateHeaderLatinInitial = "\u0043видетельство завод. номер";
    private const string CertificateHeaderCyrillicInitial = "\u0421видетельство завод. номер";

    // Substrings of the href that identify the two kinds of PDF links.
    private const string DescriptionPdfMarker = "/prof/opisanie/";
    private const string MethodologyPdfMarker = "/prof/metodiki/";

    /// <summary>
    /// Extracts all catalog entries found in <paramref name="html"/>.
    /// </summary>
    /// <param name="html">Page markup; <c>null</c> is treated as an empty document.</param>
    /// <param name="baseUrl">Base URL handed to the helper that makes extracted links absolute.</param>
    /// <returns>
    /// The entries that have a non-blank registry number or a non-blank name, in document
    /// order. The list is empty when the page contains no entry nodes.
    /// </returns>
    public IReadOnlyList<CatalogListItem> Parse(string html, string baseUrl)
    {
        var document = new HtmlDocument();
        document.LoadHtml(html ?? string.Empty);

        var items = new List<CatalogListItem>();

        // SelectNodes yields null (not an empty collection) when nothing matches.
        var entryNodes = document.DocumentNode.SelectNodes(CatalogEntryXPath);
        if (entryNodes is null)
        {
            return items;
        }

        foreach (var entryNode in entryNodes)
        {
            var item = new CatalogListItem
            {
                RegistryNumber = ReadBlockValue(entryNode, RegistryNumberHeader),
                Name = ReadBlockValue(entryNode, NameHeader),
                TypeDesignation = ReadBlockValue(entryNode, TypeDesignationHeader),
                Manufacturer = ReadBlockValue(entryNode, ManufacturerHeader),
                VerificationInterval = ReadBlockValue(entryNode, VerificationIntervalHeader),
                // NOTE(review): the second lookup runs only when the first yields null, so this
                // relies on ExtractNodeTextExcludingChildParagraph returning null for a
                // missing block - confirm against the helper.
                CertificateOrSerialNumber = ReadBlockValue(entryNode, CertificateHeaderLatinInitial)
                    ?? ReadBlockValue(entryNode, CertificateHeaderCyrillicInitial),
                DetailUrl = ReadDetailUrl(entryNode, baseUrl),
                DescriptionTypePdfUrl = ReadPdfUrl(entryNode, DescriptionPdfMarker, baseUrl),
                MethodologyPdfUrl = ReadPdfUrl(entryNode, MethodologyPdfMarker, baseUrl)
            };

            // Entries with neither a registry number nor a name are dropped.
            var hasRegistryNumber = !string.IsNullOrWhiteSpace(item.RegistryNumber);
            var hasName = !string.IsNullOrWhiteSpace(item.Name);
            if (hasRegistryNumber || hasName)
            {
                items.Add(item);
            }
        }

        return items;
    }

    /// <summary>
    /// Returns the text of the field block labelled <paramref name="header"/>, as produced by
    /// <c>HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph</c>. When no block has
    /// that label the helper is called with <c>null</c>.
    /// </summary>
    private static string ReadBlockValue(HtmlNode root, string header)
    {
        return HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph(FindBlockByHeader(root, header));
    }

    /// <summary>
    /// Finds the first direct child <c>div</c> of <paramref name="root"/> whose direct child
    /// <c>p</c> has a normalized label equal to <paramref name="header"/>, ignoring case
    /// (ordinal comparison).
    /// </summary>
    /// <returns>The matching block, or <c>null</c> when there is none.</returns>
    private static HtmlNode FindBlockByHeader(HtmlNode root, string header)
    {
        var candidates = root.SelectNodes("./div");
        if (candidates is null)
        {
            return null;
        }

        foreach (var candidate in candidates)
        {
            // A block without a <p> child passes null to NormalizeLabel.
            var label = HtmlParsingHelpers.NormalizeLabel(candidate.SelectSingleNode("./p")?.InnerText);
            if (string.Equals(label, header, StringComparison.OrdinalIgnoreCase))
            {
                return candidate;
            }
        }

        return null;
    }

    /// <summary>
    /// Resolves the link to the entry's detail page: the first anchor directly under a
    /// descendant <c>div</c> whose class attribute contains <c>resulttable6</c>, falling back
    /// to one whose class attribute contains <c>resulttable4</c>.
    /// </summary>
    /// <returns>
    /// The result of <c>HtmlParsingHelpers.MakeAbsoluteUrl</c> for the anchor's <c>href</c>;
    /// the helper receives <c>null</c> when no anchor is found or it has no <c>href</c>.
    /// </returns>
    private static string ReadDetailUrl(HtmlNode root, string baseUrl)
    {
        var anchor = root.SelectSingleNode(".//div[contains(@class,'resulttable6')]/a[1]");
        if (anchor is null)
        {
            anchor = root.SelectSingleNode(".//div[contains(@class,'resulttable4')]/a[1]");
        }

        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, anchor?.GetAttributeValue("href", null));
    }

    /// <summary>
    /// Resolves the first descendant anchor whose <c>href</c> contains
    /// <paramref name="marker"/> and passes that <c>href</c> to
    /// <c>HtmlParsingHelpers.MakeAbsoluteUrl</c>.
    /// </summary>
    /// <param name="root">Entry node to search in.</param>
    /// <param name="marker">
    /// Substring to look for in the <c>href</c>. It is embedded verbatim in a single-quoted
    /// XPath literal, so it must not contain a single quote.
    /// </param>
    /// <param name="baseUrl">Base URL handed to the helper.</param>
    private static string ReadPdfUrl(HtmlNode root, string marker, string baseUrl)
    {
        var xpath = $".//a[contains(@href,'{marker}')]";
        var anchor = root.SelectSingleNode(xpath);
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, anchor?.GetAttributeValue("href", null));
    }
}