Добавьте файлы проекта.
This commit is contained in:
64
Parsing/HtmlParsingHelpers.cs
Normal file
64
Parsing/HtmlParsingHelpers.cs
Normal file
@@ -0,0 +1,64 @@
|
||||
using System;
|
||||
using System.Text.RegularExpressions;
|
||||
using HtmlAgilityPack;
|
||||
|
||||
namespace CRAWLER.Parsing;
|
||||
|
||||
internal static class HtmlParsingHelpers
|
||||
{
|
||||
/// <summary>
/// Decodes HTML entities in <paramref name="value"/>, converts non-breaking spaces to
/// regular spaces, collapses every run of whitespace into a single space and trims the ends.
/// </summary>
/// <param name="value">Raw text that may contain HTML entities; may be <c>null</c>.</param>
/// <returns>
/// The normalized text, or <c>null</c> when the input is null, blank, or contains nothing
/// but whitespace once entities are decoded (for example <c>"&amp;nbsp;"</c>).
/// </returns>
public static string NormalizeWhitespace(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    var decoded = HtmlEntity.DeEntitize(value);
    decoded = decoded.Replace('\u00A0', ' ');
    decoded = Regex.Replace(decoded, @"\s+", " ").Trim();

    // Entity-only input such as "&nbsp;" passes the guard above but is blank once decoded.
    // Report it as null as well, so callers see a single "no text" value instead of
    // having to check for both null and the empty string.
    return decoded.Length == 0 ? null : decoded;
}
|
||||
|
||||
/// <summary>
/// Normalizes a field label: whitespace is normalized via <c>NormalizeWhitespace</c>,
/// a space directly before a colon is removed, and leading/trailing colons and spaces
/// are stripped.
/// </summary>
/// <param name="value">Raw label text; may be <c>null</c>.</param>
/// <returns>
/// The cleaned label, or <c>null</c> when <c>NormalizeWhitespace</c> returns <c>null</c>
/// for <paramref name="value"/>.
/// </returns>
public static string NormalizeLabel(string value)
{
    var normalized = NormalizeWhitespace(value);
    if (normalized == null)
    {
        return null;
    }

    var withoutSpaceBeforeColon = normalized.Replace(" :", ":");
    return withoutSpaceBeforeColon.Trim(':', ' ');
}
|
||||
|
||||
/// <summary>
/// Resolves a link into an absolute URL string.
/// </summary>
/// <param name="baseUrl">
/// Absolute URL that non-absolute links are resolved against; a trailing slash is optional.
/// </param>
/// <param name="urlOrPath">
/// An absolute URL, a scheme-relative URL (<c>//host/path</c>), or a path.
/// </param>
/// <returns>
/// <c>null</c> when <paramref name="urlOrPath"/> is null or blank; otherwise the absolute URL.
/// Paths, with or without a leading slash, are appended beneath <paramref name="baseUrl"/>.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="baseUrl"/> is null and <paramref name="urlOrPath"/> is not already absolute.
/// </exception>
/// <exception cref="UriFormatException">
/// <paramref name="baseUrl"/> is not a valid absolute URL.
/// </exception>
public static string MakeAbsoluteUrl(string baseUrl, string urlOrPath)
{
    if (string.IsNullOrWhiteSpace(urlOrPath))
    {
        return null;
    }

    var candidate = urlOrPath.Trim();

    // A leading slash never denotes an absolute web URL, but on Unix-like systems
    // Uri.TryCreate accepts "/path" as an absolute file:// URI. Skip the absolute check
    // for such values so the result does not depend on the operating system.
    var startsWithSlash = candidate[0] == '/';
    if (!startsWithSlash && Uri.TryCreate(candidate, UriKind.Absolute, out var absoluteUri))
    {
        return absoluteUri.ToString();
    }

    if (baseUrl == null)
    {
        throw new ArgumentNullException(nameof(baseUrl));
    }

    var baseUri = new Uri(baseUrl.TrimEnd('/') + "/");

    // Scheme-relative links ("//host/path") name another host. Let Uri resolve them with
    // the base scheme instead of stripping the slashes and treating the host as a path.
    var isSchemeRelative = candidate.Length > 2
        && candidate[1] == '/'
        && candidate[2] != '/'
        && startsWithSlash;
    if (isSchemeRelative)
    {
        return new Uri(baseUri, candidate).ToString();
    }

    // Leading slashes are dropped on purpose: rooted paths are appended beneath baseUrl.
    return new Uri(baseUri, candidate.TrimStart('/')).ToString();
}
|
||||
|
||||
/// <summary>
/// Returns the whitespace-normalized inner text of <paramref name="node"/> with the text of
/// its direct <c>p</c> child elements left out.
/// </summary>
/// <param name="node">Node to read; may be <c>null</c>.</param>
/// <returns>
/// <c>null</c> when <paramref name="node"/> is null; otherwise the result of passing the
/// remaining inner text through <c>NormalizeWhitespace</c>.
/// </returns>
public static string ExtractNodeTextExcludingChildParagraph(HtmlNode node)
{
    if (node is null)
    {
        return null;
    }

    // Paragraphs are removed from a deep copy, not from the node that was passed in.
    var copy = node.CloneNode(true);

    // "./p" selects direct children only; the result may be null, which counts as zero here.
    var directParagraphs = copy.SelectNodes("./p");
    var paragraphCount = directParagraphs?.Count ?? 0;

    for (var index = 0; index < paragraphCount; index++)
    {
        directParagraphs[index].Remove();
    }

    return NormalizeWhitespace(copy.InnerText);
}
|
||||
}
|
||||
Reference in New Issue
Block a user