Добавьте файлы проекта.
This commit is contained in:
86
Parsing/CatalogPageParser.cs
Normal file
86
Parsing/CatalogPageParser.cs
Normal file
@@ -0,0 +1,86 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using CRAWLER.Models;
|
||||
using HtmlAgilityPack;
|
||||
|
||||
namespace CRAWLER.Parsing;
|
||||
|
||||
/// <summary>
/// Parses a catalog listing page into <see cref="CatalogListItem"/> entries.
/// One item is produced for every <c>div</c> whose class list contains the
/// token <c>sivreestre</c>.
/// </summary>
internal sealed class CatalogPageParser
{
    // Matches the exact class token "sivreestre" (not a substring of another class name).
    private const string ItemXPath =
        "//div[contains(concat(' ', normalize-space(@class), ' '), ' sivreestre ')]";

    // The certificate header is looked up under two spellings that look identical on screen:
    // one starts with LATIN CAPITAL LETTER C (U+0043), the other with CYRILLIC CAPITAL
    // LETTER ES (U+0421). The first character is written as an escape so the difference
    // stays visible and cannot be "fixed" by accident.
    private const string CertificateHeaderLatinC = "\u0043видетельство завод. номер";
    private const string CertificateHeaderCyrillicEs = "\u0421видетельство завод. номер";

    /// <summary>
    /// Extracts all catalog items from the supplied page markup.
    /// </summary>
    /// <param name="html">Page markup; <c>null</c> is treated as an empty document.</param>
    /// <param name="baseUrl">Base URL passed to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c> for every link.</param>
    /// <returns>
    /// The items that have a non-blank registry number or name, in document order.
    /// Never <c>null</c>; empty when no item container is found.
    /// </returns>
    public IReadOnlyList<CatalogListItem> Parse(string html, string baseUrl)
    {
        var document = new HtmlDocument();
        document.LoadHtml(html ?? string.Empty);

        var items = new List<CatalogListItem>();
        var nodes = document.DocumentNode.SelectNodes(ItemXPath);
        if (nodes == null)
        {
            return items;
        }

        foreach (var node in nodes)
        {
            // Index the labelled child blocks once instead of rescanning them for every field.
            var blocks = IndexBlocksByLabel(node);

            var item = new CatalogListItem
            {
                RegistryNumber = ReadBlockValue(blocks, "№ в госреестре"),
                Name = ReadBlockValue(blocks, "Наименование"),
                TypeDesignation = ReadBlockValue(blocks, "Тип"),
                Manufacturer = ReadBlockValue(blocks, "Производитель"),
                VerificationInterval = ReadBlockValue(blocks, "МПИ"),
                CertificateOrSerialNumber = ReadFirstBlockValue(
                    blocks,
                    CertificateHeaderLatinC,
                    CertificateHeaderCyrillicEs),
                DetailUrl = ReadDetailUrl(node, baseUrl),
                DescriptionTypePdfUrl = ReadPdfUrl(node, "/prof/opisanie/", baseUrl),
                MethodologyPdfUrl = ReadPdfUrl(node, "/prof/metodiki/", baseUrl)
            };

            // Containers with neither a registry number nor a name are skipped.
            if (!string.IsNullOrWhiteSpace(item.RegistryNumber) || !string.IsNullOrWhiteSpace(item.Name))
            {
                items.Add(item);
            }
        }

        return items;
    }

    /// <summary>
    /// Builds a case-insensitive map from the normalized text of each direct child
    /// <c>div</c>'s first <c>p</c> element to that <c>div</c>.
    /// When several blocks share a label, the first one in document order wins.
    /// </summary>
    private static Dictionary<string, HtmlNode> IndexBlocksByLabel(HtmlNode root)
    {
        var index = new Dictionary<string, HtmlNode>(StringComparer.OrdinalIgnoreCase);

        var blocks = root.SelectNodes("./div");
        if (blocks == null)
        {
            return index;
        }

        foreach (var block in blocks)
        {
            var paragraph = block.SelectSingleNode("./p");
            var label = HtmlParsingHelpers.NormalizeLabel(paragraph?.InnerText);

            // Blocks without a usable label can never match a header, so they are not indexed.
            if (!string.IsNullOrEmpty(label) && !index.ContainsKey(label))
            {
                index.Add(label, block);
            }
        }

        return index;
    }

    /// <summary>
    /// Returns the text of the block labelled <paramref name="header"/> with its label
    /// paragraph removed, or <c>null</c> when there is no such block.
    /// </summary>
    private static string ReadBlockValue(IDictionary<string, HtmlNode> blocks, string header)
    {
        return blocks.TryGetValue(header, out var block)
            ? HtmlParsingHelpers.ExtractNodeTextExcludingChildParagraph(block)
            : null;
    }

    /// <summary>
    /// Tries each header in order and returns the first non-blank block value,
    /// or <c>null</c> when none of the headers yields any text.
    /// </summary>
    private static string ReadFirstBlockValue(IDictionary<string, HtmlNode> blocks, params string[] headers)
    {
        foreach (var header in headers)
        {
            var value = ReadBlockValue(blocks, header);
            if (!string.IsNullOrWhiteSpace(value))
            {
                return value;
            }
        }

        return null;
    }

    /// <summary>
    /// Resolves the detail-page link: the first anchor directly inside a <c>div</c> whose
    /// class contains <c>resulttable6</c>, falling back to <c>resulttable4</c>.
    /// Returns <c>null</c> when neither exists or the anchor has no <c>href</c>.
    /// </summary>
    private static string ReadDetailUrl(HtmlNode root, string baseUrl)
    {
        var link = root.SelectSingleNode(".//div[contains(@class,'resulttable6')]/a[1]")
            ?? root.SelectSingleNode(".//div[contains(@class,'resulttable4')]/a[1]");

        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, link?.GetAttributeValue("href", null));
    }

    /// <summary>
    /// Resolves the first link under <paramref name="root"/> whose <c>href</c> contains
    /// <paramref name="marker"/>, or <c>null</c> when there is none.
    /// </summary>
    /// <remarks>
    /// <paramref name="marker"/> is spliced into the XPath unescaped, so it must not
    /// contain a single quote; all current callers pass constant path fragments.
    /// </remarks>
    private static string ReadPdfUrl(HtmlNode root, string marker, string baseUrl)
    {
        var link = root.SelectSingleNode($".//a[contains(@href,'{marker}')]");
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, link?.GetAttributeValue("href", null));
    }
}
|
||||
65
Parsing/DetailPageParser.cs
Normal file
65
Parsing/DetailPageParser.cs
Normal file
@@ -0,0 +1,65 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using CRAWLER.Models;
|
||||
using HtmlAgilityPack;
|
||||
|
||||
namespace CRAWLER.Parsing;
|
||||
|
||||
/// <summary>
/// Parses an instrument detail page into a <see cref="ParsedInstrumentDetails"/> object.
/// Field values are read from the two-column rows of tables whose class contains
/// <c>resulttable1</c>; PDF links are located anywhere in the document.
/// </summary>
internal sealed class DetailPageParser
{
    /// <summary>
    /// Reads the labelled fields and PDF links from the supplied page markup.
    /// </summary>
    /// <param name="html">Page markup; <c>null</c> is treated as an empty document.</param>
    /// <param name="baseUrl">Base URL passed to <c>HtmlParsingHelpers.MakeAbsoluteUrl</c> for the PDF links.</param>
    /// <returns>
    /// A populated details object; every field whose label is absent from the page is <c>null</c>.
    /// </returns>
    public ParsedInstrumentDetails Parse(string html, string baseUrl)
    {
        var page = new HtmlDocument();
        page.LoadHtml(html ?? string.Empty);
        var root = page.DocumentNode;

        var fields = ReadLabeledCells(root);

        // Missing labels map to null rather than throwing.
        string Field(string label) => fields.TryGetValue(label, out var text) ? text : null;

        return new ParsedInstrumentDetails
        {
            RegistryNumber = Field("Номер в госреестре"),
            Name = Field("Наименование"),
            TypeDesignation = Field("Обозначение типа"),
            Manufacturer = Field("Производитель"),
            VerificationInterval = Field("Межповерочный интервал (МПИ)"),
            CertificateOrSerialNumber = Field("Срок свидетельства или заводской номер"),
            AllowsBatchVerification = Field("Допускается поверка партии"),
            HasPeriodicVerification = Field("Наличие периодической поверки"),
            TypeInfo = Field("Сведения о типе"),
            Purpose = Field("Назначение"),
            Description = Field("Описание"),
            Software = Field("Программное обеспечение"),
            MetrologicalCharacteristics = Field("Метрологические и технические характеристики"),
            Completeness = Field("Комплектность"),
            Verification = Field("Поверка"),
            RegulatoryDocuments = Field("Нормативные и технические документы"),
            Applicant = Field("Заявитель"),
            TestCenter = Field("Испытательный центр"),
            DescriptionTypePdfUrl = ResolveFirstLink(root, "//a[contains(@href,'/prof/opisanie/')]", baseUrl),
            MethodologyPdfUrl = ResolveFirstLink(root, "//a[contains(@href,'/prof/metodiki/')]", baseUrl)
        };
    }

    /// <summary>
    /// Collects label/value pairs from every table row that has at least two <c>td</c> cells.
    /// Keys are compared case-insensitively; when a label repeats, the last row wins.
    /// </summary>
    private static Dictionary<string, string> ReadLabeledCells(HtmlNode root)
    {
        var result = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        var tableRows = root.SelectNodes("//table[contains(@class,'resulttable1')]//tr");
        if (tableRows == null)
        {
            return result;
        }

        foreach (var tableRow in tableRows)
        {
            var columns = tableRow.SelectNodes("./td");
            if (columns != null && columns.Count >= 2)
            {
                // First cell is the label, second cell is the value; extra cells are ignored.
                var key = HtmlParsingHelpers.NormalizeLabel(columns[0].InnerText);
                var text = HtmlParsingHelpers.NormalizeWhitespace(columns[1].InnerText);

                if (string.IsNullOrWhiteSpace(key))
                {
                    continue;
                }

                result[key] = text;
            }
        }

        return result;
    }

    /// <summary>
    /// Returns the absolute URL of the first node matched by <paramref name="xpath"/>,
    /// or <c>null</c> when nothing matches or the node has no <c>href</c>.
    /// </summary>
    private static string ResolveFirstLink(HtmlNode root, string xpath, string baseUrl)
    {
        var anchor = root.SelectSingleNode(xpath);
        var href = anchor?.GetAttributeValue("href", null);
        return HtmlParsingHelpers.MakeAbsoluteUrl(baseUrl, href);
    }
}
|
||||
64
Parsing/HtmlParsingHelpers.cs
Normal file
64
Parsing/HtmlParsingHelpers.cs
Normal file
@@ -0,0 +1,64 @@
|
||||
using System;
|
||||
using System.Text.RegularExpressions;
|
||||
using HtmlAgilityPack;
|
||||
|
||||
namespace CRAWLER.Parsing;
|
||||
|
||||
/// <summary>
/// Text and URL normalization helpers shared by the HTML page parsers.
/// </summary>
internal static class HtmlParsingHelpers
{
    // Cached so the pattern is not looked up again on every call.
    private static readonly Regex WhitespaceRun = new Regex(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Decodes HTML entities, converts non-breaking spaces to regular spaces, collapses
    /// every run of whitespace to a single space and trims the result.
    /// </summary>
    /// <param name="value">Raw text, typically an element's <c>InnerText</c>.</param>
    /// <returns>
    /// The normalized text, or <c>null</c> when the input is <c>null</c>, blank, or becomes
    /// blank after decoding (for example <c>&amp;nbsp;</c>).
    /// </returns>
    public static string NormalizeWhitespace(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return null;
        }

        var decoded = HtmlEntity.DeEntitize(value);
        decoded = decoded.Replace('\u00A0', ' ');
        decoded = WhitespaceRun.Replace(decoded, " ").Trim();

        // Input such as "&nbsp;" is not blank before decoding but is blank afterwards;
        // report it as "no text" like every other blank input.
        return decoded.Length == 0 ? null : decoded;
    }

    /// <summary>
    /// Normalizes a field label: applies <see cref="NormalizeWhitespace"/>, removes the
    /// space before a colon and strips leading/trailing colons and spaces.
    /// </summary>
    /// <param name="value">Raw label text.</param>
    /// <returns>
    /// The cleaned label, or <c>null</c> when the input has no text. The result can be an
    /// empty string when the text consists only of colons.
    /// </returns>
    public static string NormalizeLabel(string value)
    {
        return NormalizeWhitespace(value)
            ?.Replace(" :", ":")
            .Trim(':', ' ');
    }

    /// <summary>
    /// Converts a link target into an absolute URL.
    /// </summary>
    /// <param name="baseUrl">
    /// URL that relative targets are appended to. A trailing slash is enforced and leading
    /// slashes of the target are dropped, so a rooted target such as <c>/a/b</c> is resolved
    /// below <paramref name="baseUrl"/> rather than below the host root.
    /// </param>
    /// <param name="urlOrPath">Absolute URL, protocol-relative URL (<c>//host/path</c>) or relative path.</param>
    /// <returns>The absolute URL, or <c>null</c> when <paramref name="urlOrPath"/> is null or blank.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="baseUrl"/> is <c>null</c> and <paramref name="urlOrPath"/> is not already absolute.
    /// </exception>
    /// <exception cref="UriFormatException"><paramref name="baseUrl"/> is not a valid absolute URL.</exception>
    public static string MakeAbsoluteUrl(string baseUrl, string urlOrPath)
    {
        if (string.IsNullOrWhiteSpace(urlOrPath))
        {
            return null;
        }

        urlOrPath = urlOrPath.Trim();

        // UriKind.Absolute also accepts bare file-system paths: on Unix "/prof/x.pdf" parses
        // as file:///prof/x.pdf and on Windows "//host/x" parses as a UNC file URI. Such
        // implicit file URIs are really relative hrefs and must go through the base URL.
        if (Uri.TryCreate(urlOrPath, UriKind.Absolute, out var absoluteUri)
            && !IsImplicitFilePath(absoluteUri, urlOrPath))
        {
            return absoluteUri.ToString();
        }

        if (baseUrl == null)
        {
            throw new ArgumentNullException(nameof(baseUrl));
        }

        var baseUri = new Uri(baseUrl.TrimEnd('/') + "/");

        // Protocol-relative target: keep its host and borrow only the scheme from the base.
        if (urlOrPath.StartsWith("//", StringComparison.Ordinal)
            && Uri.TryCreate(baseUri.Scheme + ":" + urlOrPath, UriKind.Absolute, out var networkPathUri))
        {
            return networkPathUri.ToString();
        }

        return new Uri(baseUri, urlOrPath.TrimStart('/')).ToString();
    }

    /// <summary>
    /// Returns the node's text with the text of its direct <c>p</c> children left out.
    /// </summary>
    /// <param name="node">Node to read; may be <c>null</c>.</param>
    /// <returns>
    /// The whitespace-normalized remaining text, or <c>null</c> when <paramref name="node"/>
    /// is <c>null</c> or nothing but the paragraphs carried text.
    /// </returns>
    public static string ExtractNodeTextExcludingChildParagraph(HtmlNode node)
    {
        if (node == null)
        {
            return null;
        }

        // Work on a deep copy so the caller's document is left untouched.
        var clone = node.CloneNode(true);
        var paragraphs = clone.SelectNodes("./p");
        if (paragraphs != null)
        {
            foreach (var paragraph in paragraphs)
            {
                paragraph.Remove();
            }
        }

        return NormalizeWhitespace(clone.InnerText);
    }

    /// <summary>
    /// True when <paramref name="parsed"/> is a file URI although <paramref name="original"/>
    /// did not spell out the <c>file:</c> scheme, i.e. the parser read it as a local or UNC path.
    /// </summary>
    private static bool IsImplicitFilePath(Uri parsed, string original)
    {
        return parsed.IsFile
            && !original.StartsWith("file:", StringComparison.OrdinalIgnoreCase);
    }
}
|
||||
Reference in New Issue
Block a user