65 lines
1.6 KiB
C#
65 lines
1.6 KiB
C#
using System;
|
|
using System.Text.RegularExpressions;
|
|
using HtmlAgilityPack;
|
|
|
|
namespace CRAWLER.Parsing;
|
|
|
|
/// <summary>
/// Stateless text and URL helpers used when parsing crawled HTML.
/// </summary>
internal static class HtmlParsingHelpers
{
    // Built once and reused; matches any run of Unicode whitespace.
    private static readonly Regex WhitespaceRun = new(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Decodes HTML entities, collapses every run of whitespace (including
    /// non-breaking spaces) to a single space and trims the result.
    /// </summary>
    /// <param name="value">Raw text, possibly containing HTML entities.</param>
    /// <returns>
    /// The normalised text, or <c>null</c> when the input is null/blank or
    /// contains nothing but whitespace after decoding.
    /// </returns>
    public static string NormalizeWhitespace(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return null;
        }

        var decoded = HtmlEntity.DeEntitize(value);
        decoded = decoded.Replace('\u00A0', ' ');
        decoded = WhitespaceRun.Replace(decoded, " ").Trim();

        // Entity-only input such as "&nbsp;" is not blank until it has been
        // decoded, so re-check here to keep the "blank => null" contract uniform.
        return decoded.Length == 0 ? null : decoded;
    }

    /// <summary>
    /// Normalises a label: whitespace is normalised, a space before a colon is
    /// removed, and leading/trailing colons and spaces are stripped.
    /// </summary>
    /// <param name="value">Raw label text.</param>
    /// <returns>
    /// The cleaned label, or <c>null</c> when <see cref="NormalizeWhitespace"/>
    /// yields <c>null</c>.
    /// </returns>
    public static string NormalizeLabel(string value)
    {
        return NormalizeWhitespace(value)
            ?.Replace(" :", ":")
            .Trim(':', ' ');
    }

    /// <summary>
    /// Turns a URL or path into an absolute URL.
    /// </summary>
    /// <remarks>
    /// Paths are appended to <paramref name="baseUrl"/> as a prefix: leading
    /// slashes on the path are dropped, so <c>/a</c> and <c>a</c> both resolve
    /// beneath the base URL's own path.
    /// </remarks>
    /// <param name="baseUrl">
    /// Absolute base URL; only used when <paramref name="urlOrPath"/> is not
    /// already an absolute URL.
    /// </param>
    /// <param name="urlOrPath">
    /// Absolute URL, scheme-relative reference (<c>//host/path</c>) or path.
    /// </param>
    /// <returns>
    /// The absolute URL, or <c>null</c> when <paramref name="urlOrPath"/> is
    /// null or blank.
    /// </returns>
    public static string MakeAbsoluteUrl(string baseUrl, string urlOrPath)
    {
        if (string.IsNullOrWhiteSpace(urlOrPath))
        {
            return null;
        }

        // On Unix-like systems Uri.TryCreate parses "/some/path" as an absolute
        // file:// URI, so anything starting with a slash must never take the
        // "already absolute" shortcut.
        var startsWithSlash = urlOrPath.StartsWith("/", StringComparison.Ordinal);
        if (!startsWithSlash && Uri.TryCreate(urlOrPath, UriKind.Absolute, out var absoluteUri))
        {
            return absoluteUri.ToString();
        }

        var baseUri = new Uri(baseUrl.TrimEnd('/') + "/");

        // Scheme-relative reference ("//host/path"): inherit the base URL's scheme.
        if (urlOrPath.StartsWith("//", StringComparison.Ordinal)
            && Uri.TryCreate(baseUri.Scheme + ":" + urlOrPath, UriKind.Absolute, out var schemeRelativeUri))
        {
            return schemeRelativeUri.ToString();
        }

        return new Uri(baseUri, urlOrPath.TrimStart('/')).ToString();
    }

    /// <summary>
    /// Returns the normalised text of <paramref name="node"/> with the text of
    /// its direct <c>&lt;p&gt;</c> children left out.
    /// </summary>
    /// <param name="node">Node to read; it is cloned first and not modified.</param>
    /// <returns>
    /// The normalised text, or <c>null</c> when <paramref name="node"/> is null
    /// or no text remains.
    /// </returns>
    public static string ExtractNodeTextExcludingChildParagraph(HtmlNode node)
    {
        if (node == null)
        {
            return null;
        }

        // Work on a deep copy so removing paragraphs leaves the caller's DOM intact.
        var clone = node.CloneNode(true);
        var paragraphs = clone.SelectNodes("./p");
        if (paragraphs != null)
        {
            foreach (var paragraph in paragraphs)
            {
                paragraph.Remove();
            }
        }

        return NormalizeWhitespace(clone.InnerText);
    }
}
|