Created
October 6, 2021 13:13
-
-
Save ufuk/badf3b18f099a54a20a6bb6e9f8d5f1a to your computer and use it in GitHub Desktop.
Utility method to convert HTML text to plain text while preserving newlines (using jsoup as main dependency)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
public final class HtmlToTextUtils { | |
public static String htmlToTextWhilePreservingNewlines(String htmlText) { | |
final Document document = Jsoup.parse(StringUtils.trimToEmpty(htmlText)); | |
String plainText = buildTextFromNodeWhilePreservingNewlines(document.body()).toString().trim(); | |
plainText = plainText.replaceAll(" +", " "); | |
plainText = plainText.lines().map(String::trim).collect(Collectors.joining("\n")); | |
plainText = plainText.replaceAll("\n{2,}", "\n\n"); | |
return plainText; | |
} | |
private static StringBuilder buildTextFromNodeWhilePreservingNewlines(Node node) { | |
final StringBuilder stringBuilder = new StringBuilder(); | |
if (node instanceof TextNode) { | |
TextNode textNode = (TextNode) node; | |
stringBuilder.append(textNode.text()); | |
} | |
for (Node childNode : node.childNodes()) { | |
stringBuilder.append(buildTextFromNodeWhilePreservingNewlines(childNode)); | |
} | |
if (node instanceof Element) { | |
final String tagName = ((Element) node).tagName(); | |
if (List.of("p", "br", "li", "h1", "h2", "h3", "h4", "h5", "h6").contains(tagName)) { | |
stringBuilder.append("\n"); | |
} | |
} | |
return stringBuilder; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.junit.jupiter.api.Test; | |
import static org.assertj.core.api.Assertions.assertThat; | |
class HtmlToTextUtilsTests { | |
@Test | |
void shouldCleanHtmlToTextWhilePreservingNewlines() { | |
final String html = "<h5>Description</h5><ul><li>%100 cotton</li><li>Flat pattern</li></ul><p><br></p><p><br></p><h5>Model Information</h5><p><strong>Chest:</strong> 90cm <strong>Waist:</strong> 60cm <strong>Width:</strong> 90cm <strong>Length:</strong> 180cm </p><p> The model is wearing <strong>L</strong> sized product.</p>"; | |
assertThat(HtmlToTextUtils.htmlToTextWhilePreservingNewlines(html)) | |
.isEqualTo("Description\n" + | |
"%100 cotton\n" + | |
"Flat pattern\n" + | |
"\n" + | |
"Model Information\n" + | |
"Chest: 90cm Waist: 60cm Width: 90cm Length: 180cm\n" + | |
"The model is wearing L sized product."); | |
} | |
@Test | |
void shouldCleanHtmlToTextWhilePreservingNewlinesWhenNullHtml() { | |
final String html = null; | |
assertThat(HtmlToTextUtils.htmlToTextWhilePreservingNewlines(html)).isEqualTo(""); | |
} | |
@Test | |
void shouldCleanHtmlToTextWhilePreservingNewlinesWhenEmptyHtml() { | |
final String html = ""; | |
assertThat(HtmlToTextUtils.htmlToTextWhilePreservingNewlines(html)).isEqualTo(""); | |
} | |
@Test | |
void shouldCleanHtmlToTextWhilePreservingNewlinesWhenBlankHtml() { | |
final String html = " "; | |
assertThat(HtmlToTextUtils.htmlToTextWhilePreservingNewlines(html)).isEqualTo(""); | |
} | |
@Test | |
void shouldCleanHtmlToTextWhilePreservingNewlinesWhenInvalidHtml() { | |
final String html = "<p>invalid<br>tags<p>"; | |
assertThat(HtmlToTextUtils.htmlToTextWhilePreservingNewlines(html)) | |
.isEqualTo("invalid\n" + | |
"tags"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment