using System;
using System.Collections.Generic;
using CRAWLER.Models;
using HtmlAgilityPack;

namespace CRAWLER.Parsing;

/// <summary>
/// Parses the HTML of a catalog listing page into <see cref="CatalogListItem"/> entries.
/// </summary>
internal sealed class CatalogPageParser
{
    /// <summary>
    /// Builds one <see cref="CatalogListItem"/> for every <c>div</c> whose class list
    /// contains the token <c>sivreestre</c>.
    /// </summary>
    /// <param name="html">Page markup; <c>null</c> is loaded as an empty document.</param>
    /// <param name="baseUrl">
    /// Forwarded to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c> together with every extracted href.
    /// </param>
    /// <returns>
    /// The items that have a non-blank registry number or a non-blank name, in document order.
    /// An empty list is returned when no matching row is found.
    /// </returns>
    public IReadOnlyList<CatalogListItem> Parse(string html, string baseUrl)
    {
        var document = new HtmlDocument();
        document.LoadHtml(html ?? string.Empty);

        // The element type is spelled out explicitly: a bare List / IReadOnlyList does not compile.
        var items = new List<CatalogListItem>();

        // Padding the normalized class attribute with spaces matches "sivreestre" only as a
        // whole class token, never as a substring of a longer class name.
        var nodes = document.DocumentNode.SelectNodes(
            "//div[contains(concat(' ', normalize-space(@class), ' '), ' sivreestre ')]");
        if (nodes == null)
        {
            return items;
        }

        foreach (var node in nodes)
        {
            var item = new CatalogListItem
            {
                RegistryNumber = ReadBlockValue(node, "№ в госреестре"),
                Name = ReadBlockValue(node, "Наименование"),
                TypeDesignation = ReadBlockValue(node, "Тип"),
                Manufacturer = ReadBlockValue(node, "Производитель"),
                VerificationInterval = ReadBlockValue(node, "МПИ"),
                // Two spellings of the same header are tried in turn.
                // NOTE(review): the literals appear to differ only in the first letter
                // (presumably Latin "C" vs. Cyrillic "С") - keep both exactly as they are.
                // The fallback only fires when the first lookup yields null.
                CertificateOrSerialNumber = ReadBlockValue(node, "Cвидетельство завод. номер")
                    ?? ReadBlockValue(node, "Свидетельство завод. номер"),
                DetailUrl = ReadDetailUrl(node, baseUrl),
                DescriptionTypePdfUrl = ReadPdfUrl(node, "/prof/opisanie/", baseUrl),
                MethodologyPdfUrl = ReadPdfUrl(node, "/prof/metodiki/", baseUrl)
            };

            // Rows with neither a registry number nor a name are dropped.
            if (!string.IsNullOrWhiteSpace(item.RegistryNumber) || !string.IsNullOrWhiteSpace(item.Name))
            {
                items.Add(item);
            }
        }

        return items;
    }

    /// <summary>
    /// Locates the child block labelled <paramref name="header"/> and returns the text that
    /// <c>HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph</c> extracts from it.
    /// </summary>
    /// <param name="root">Row node whose direct child blocks are searched.</param>
    /// <param name="header">Label to look for.</param>
    /// <returns>
    /// Whatever the helper returns for the located block; the helper receives <c>null</c>
    /// when no block carries the label.
    /// </returns>
    private static string ReadBlockValue(HtmlNode root, string header)
    {
        var block = FindBlockByHeader(root, header);
        return HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph(block);
    }

    /// <summary>
    /// Finds the first direct child <c>div</c> of <paramref name="root"/> whose direct child
    /// <c>p</c> holds a label equal to <paramref name="header"/>.
    /// </summary>
    /// <param name="root">Row node whose direct child <c>div</c> elements are inspected.</param>
    /// <param name="header">
    /// Expected label; compared with <see cref="StringComparison.OrdinalIgnoreCase"/> against
    /// the paragraph text after <c>HtmlParsingHelpers.NormalizeLabel</c> has been applied.
    /// </param>
    /// <returns>The matching block, or <c>null</c> when there is none.</returns>
    private static HtmlNode FindBlockByHeader(HtmlNode root, string header)
    {
        var blocks = root.SelectNodes("./div");
        if (blocks == null)
        {
            return null;
        }

        foreach (var block in blocks)
        {
            var paragraph = block.SelectSingleNode("./p");
            var label = HtmlParsingHelpers.NormalizeLabel(paragraph?.InnerText);
            if (string.Equals(label, header, StringComparison.OrdinalIgnoreCase))
            {
                return block;
            }
        }

        return null;
    }

    /// <summary>
    /// Reads the href of the first anchor inside a <c>resulttable6</c> block, falling back to
    /// a <c>resulttable4</c> block, and hands it to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c>.
    /// </summary>
    /// <param name="root">Row node to search beneath.</param>
    /// <param name="baseUrl">Base address passed to the URL helper.</param>
    /// <returns>
    /// The helper's result; the helper receives a <c>null</c> href when no anchor is found
    /// or the anchor has no <c>href</c> attribute.
    /// </returns>
    private static string ReadDetailUrl(HtmlNode root, string baseUrl)
    {
        var link = root.SelectSingleNode(".//div[contains(@class,'resulttable6')]/a[1]")
            ?? root.SelectSingleNode(".//div[contains(@class,'resulttable4')]/a[1]");

        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, link?.GetAttributeValue("href", null));
    }

    /// <summary>
    /// Reads the href of the first descendant anchor whose <c>href</c> contains
    /// <paramref name="marker"/> and hands it to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c>.
    /// </summary>
    /// <param name="root">Row node to search beneath.</param>
    /// <param name="marker">
    /// Substring that must occur in the anchor's <c>href</c>. It is interpolated into a
    /// single-quoted XPath literal, so it must not contain an apostrophe.
    /// </param>
    /// <param name="baseUrl">Base address passed to the URL helper.</param>
    /// <returns>
    /// The helper's result; the helper receives a <c>null</c> href when no anchor matches.
    /// </returns>
    private static string ReadPdfUrl(HtmlNode root, string marker, string baseUrl)
    {
        var link = root.SelectSingleNode($".//a[contains(@href,'{marker}')]");
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, link?.GetAttributeValue("href", null));
    }
}