blob: 21f9d576bd7225a84be76472bf85eabf6274abeb [file] [log] [blame]
/*******************************************************************************
*
* Copyright (c) 2018 michaelcgood
* https://github.com/michaelcgood/HTML-to-Apache-POI-RichTextString/blob/master/LICENSE.
*
* Contributors:
* michaelcgood - initial API and implementation
* Sean Muir - Modified HEAVY_REGEX to remove col tags
*
*******************************************************************************/
package org.eclipse.mdht.cda.xml.ui.handlers.html;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.ss.usermodel.Font;
import org.apache.poi.ss.usermodel.IndexedColors;
import org.apache.poi.ss.usermodel.RichTextString;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import net.htmlparser.jericho.Config;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.Source;
/**
 * HTMLToExcel - converts an HTML fragment (narrative content) into an Apache POI {@link RichTextString} so that
 * some level of the narrative can be placed into a spreadsheet cell.
 * <p>
 * The markup is parsed with the Jericho HTML parser. Tags are stripped from the text and the character ranges
 * they enclosed are recorded as a per-character {@link Font} map in {@link RichTextDetails}.
 *
 * @author seanmuir
 *
 */
public class HTMLToExcel {

    /** Tags that are deleted from the input before it is parsed (column and line-break tags). */
    private static final Pattern HEAVY_REGEX = Pattern.compile("(<col>)|(</col>)|(<br/>)|(</br>)|(<br />)|(< /br>)");

    /** Tag type of a start tag whose element also has an end tag. */
    private static final int START_TAG = 0;

    /** Tag type of an end tag. */
    private static final int END_TAG = 1;

    /** Tag type of a start tag without an end tag (e.g. self-closing); it never opens or closes a range. */
    private static final int EMPTY_TAG = 2;

    private static final String NEW_LINE = System.getProperty("line.separator");

    /**
     * Converts an HTML fragment into a rich text cell value.
     * <p>
     * The tags matched by {@link #HEAVY_REGEX} are removed, the remainder is wrapped in a {@code <div>} and every
     * {@code <div>} element found is converted with {@link #createCellValue(String, Workbook)}; the results are
     * concatenated, each followed by the platform line separator.
     *
     * @param html
     *            the HTML fragment; {@code null} is treated as empty content
     * @param workBook
     *            the workbook used to create fonts
     * @return the text of the fragment with all recognised tags removed
     */
    public RichTextString fromHtmlToCellValue(String html, Workbook workBook) {
        Config.IsHTMLEmptyElementTagRecognised = true;
        String stripped = HEAVY_REGEX.matcher(html == null
                ? ""
                : html).replaceAll("");

        // The wrapper guarantees that there is at least one <div> element to convert.
        Source source = new Source("<div>" + stripped + "</div>");

        // NOTE(review): getAllElements presumably also returns nested <div> elements, in which case their text is
        // emitted once as part of the enclosing <div> and once on its own - confirm this is intended.
        List<RichTextDetails> cellValues = new ArrayList<RichTextDetails>();
        for (Element el : source.getAllElements("div")) {
            cellValues.add(createCellValue(el.toString(), workBook));
        }
        return mergeTextDetails(cellValues);
    }

    /**
     * Concatenates the text of all details into a single rich text string, terminating each entry with the
     * platform line separator.
     *
     * @param cellValues
     *            the converted fragments, in output order
     * @return the combined text as an {@link XSSFRichTextString}
     */
    private static RichTextString mergeTextDetails(List<RichTextDetails> cellValues) {
        Config.IsHTMLEmptyElementTagRecognised = true;
        StringBuilder textBuffer = new StringBuilder();
        // Font of each character, keyed by its index in the combined text.
        Map<Integer, Font> mergedMap = new LinkedHashMap<Integer, Font>(550, .95f);
        for (RichTextDetails richTextDetail : cellValues) {
            // Indexes in the detail are relative to its own text, so shift them by what was already written.
            int offset = textBuffer.length();
            for (Entry<Integer, Font> entry : richTextDetail.getFontMap().entrySet()) {
                mergedMap.put(entry.getKey() + offset, entry.getValue());
            }
            textBuffer.append(richTextDetail.getRichText()).append(NEW_LINE);
        }
        RichTextString richText = new XSSFRichTextString(textBuffer.toString());
        // NOTE(review): applying the collected fonts is disabled, so the result is plain text and mergedMap is
        // currently unused. The disabled code is kept for reference:
        // for (int i = 0; i < textBuffer.length(); i++) {
        // Font currentFont = mergedMap.get(i);
        // if (currentFont != null) {
        // richText.applyFont(i, i + 1, currentFont);
        // }
        // }
        return richText;
    }

    /**
     * Strips all element tags from the given markup and records which character ranges of the remaining text
     * were enclosed by which tag.
     *
     * @param html
     *            the markup to convert
     * @param workBook
     *            the workbook used to create fonts
     * @return the tag-free text together with the font of each styled character index
     */
    public static RichTextDetails createCellValue(String html, Workbook workBook) {
        Config.IsHTMLEmptyElementTagRecognised = true;
        Source source = new Source(html);
        Map<String, TagInfo> tagMap = new LinkedHashMap<String, TagInfo>(550, .95f);
        for (Element e : source.getChildElements()) {
            getInfo(e, tagMap);
        }

        // Without any tag the alternation below would be empty and match at every position.
        if (tagMap.isEmpty()) {
            return new RichTextDetails(html, new LinkedHashMap<Integer, Font>());
        }

        // The tags are literal text; quote them so that characters such as '?', '(' or '[' inside attribute
        // values are not interpreted as regular expression syntax.
        List<String> quotedTags = new ArrayList<String>(tagMap.size());
        for (String tag : tagMap.keySet()) {
            quotedTags.add(Pattern.quote(tag));
        }
        Pattern pattern = Pattern.compile("(" + StringUtils.join(quotedTags, "|") + ")");
        Matcher matcher = pattern.matcher(html);

        StringBuffer textBuffer = new StringBuffer();
        List<RichTextInfo> textInfos = new ArrayList<RichTextInfo>();
        // Stack of currently open tags. An ArrayList is used because it accepts the null placeholder that
        // stands for an open tag without styling; ArrayDeque rejects null elements.
        List<RichTextInfo> openTags = new ArrayList<RichTextInfo>();
        while (matcher.find()) {
            // Drops the tag itself; the text in front of it is copied to the buffer.
            matcher.appendReplacement(textBuffer, "");
            TagInfo currentTag = tagMap.get(matcher.group(1));
            if (START_TAG == currentTag.getTagType()) {
                openTags.add(getRichTextInfo(currentTag, textBuffer.length(), workBook));
            } else if (END_TAG == currentTag.getTagType()) {
                if (!openTags.isEmpty()) {
                    RichTextInfo info = openTags.remove(openTags.size() - 1);
                    if (info != null) {
                        info.setEndIndex(textBuffer.length());
                        textInfos.add(info);
                    }
                }
            }
            // EMPTY_TAG: only removed from the text, the stack of open tags is left alone.
        }
        matcher.appendTail(textBuffer);
        Map<Integer, Font> fontMap = buildFontMap(textInfos, workBook);
        return new RichTextDetails(textBuffer.toString(), fontMap);
    }

    /**
     * Expands the styled ranges into a font per character index. Where ranges overlap, the styles are merged
     * into the font already stored for that index.
     *
     * @param textInfos
     *            the closed ranges collected while stripping tags
     * @param workBook
     *            the workbook used to create fonts
     * @return the font of each styled character, keyed by character index
     */
    private static Map<Integer, Font> buildFontMap(List<RichTextInfo> textInfos, Workbook workBook) {
        Map<Integer, Font> fontMap = new LinkedHashMap<Integer, Font>(550, .95f);
        for (RichTextInfo richTextInfo : textInfos) {
            if (!richTextInfo.isValid()) {
                continue;
            }
            // NOTE(review): mergeFont creates a new workbook font for every index that has none yet, i.e. one
            // font per styled character.
            for (int i = richTextInfo.getStartIndex(); i < richTextInfo.getEndIndex(); i++) {
                fontMap.put(
                    i,
                    mergeFont(fontMap.get(i), richTextInfo.getFontStyle(), richTextInfo.getFontValue(), workBook));
            }
        }
        return fontMap;
    }

    /**
     * Applies one style to a font, creating the font in the workbook first if none is given.
     *
     * @param font
     *            the font to modify, or {@code null} to create a new one
     * @param fontStyle
     *            the style to apply
     * @param fontValue
     *            the style value; only consulted for {@code COLOR}
     * @param workBook
     *            the workbook used to create a font
     * @return the modified (or newly created) font
     */
    @SuppressWarnings("deprecation")
    private static Font mergeFont(Font font, STYLES fontStyle, String fontValue, Workbook workBook) {
        if (font == null) {
            font = workBook.createFont();
        }
        switch (fontStyle) {
            case BOLD:
            case EM:
            case STRONG:
                // Bold is currently not applied.
                // font.setBoldweight(Font.BOLDWEIGHT_BOLD);
                break;
            case UNDERLINE:
                font.setUnderline(Font.U_SINGLE);
                break;
            case ITALLICS:
                font.setItalic(true);
                break;
            case PRE:
                font.setFontName("Courier New");
                // Must not fall through into the colour handling.
                break;
            case COLOR:
                // Any non-empty colour value is rendered as black.
                if (!isEmpty(fontValue)) {
                    font.setColor(IndexedColors.BLACK.getIndex());
                }
                break;
            default:
                break;
        }
        return font;
    }

    /**
     * Creates the range descriptor for a start tag. The end index is left at -1 until the matching end tag is
     * seen.
     *
     * @param currentTag
     *            the start tag
     * @param startIndex
     *            the index in the tag-free text at which the tag's content starts
     * @param workBook
     *            unused
     * @return the descriptor, or {@code null} for a {@code span} that carries no colour style
     */
    private static RichTextInfo getRichTextInfo(TagInfo currentTag, int startIndex, Workbook workBook) {
        STYLES tagStyle = STYLES.fromValue(currentTag.getTagName());
        RichTextInfo info = null;
        switch (tagStyle) {
            case SPAN:
                if (!isEmpty(currentTag.getStyle())) {
                    // The style attribute is a ';' separated list of "name:value" declarations.
                    for (String style : currentTag.getStyle().split(";")) {
                        String[] styleDetails = style.split(":");
                        if (styleDetails.length > 1 && "COLOR".equalsIgnoreCase(styleDetails[0].trim())) {
                            info = new RichTextInfo(startIndex, -1, STYLES.COLOR, styleDetails[1]);
                        }
                    }
                }
                break;
            default:
                info = new RichTextInfo(startIndex, -1, tagStyle);
                break;
        }
        return info;
    }

    /**
     * @param str
     *            the string to test
     * @return {@code true} if the string is {@code null} or contains only whitespace
     */
    private static boolean isEmpty(String str) {
        return (str == null || str.trim().length() == 0);
    }

    /**
     * Registers the tags of an element and, recursively, of all its descendants, keyed by the exact tag text as
     * it appears in the source.
     *
     * @param e
     *            the element to register
     * @param tagMap
     *            receives one entry per distinct tag text
     */
    private static void getInfo(Element e, Map<String, TagInfo> tagMap) {
        if (e.getEndTag() == null) {
            // Handling self closing tags: there is no end tag to pair with, so the tag must neither open a
            // range nor close the range of an enclosing element.
            tagMap.put(e.getStartTag().toString(), new TagInfo(e.getStartTag().getName(), EMPTY_TAG));
        } else {
            tagMap.put(
                e.getStartTag().toString(),
                new TagInfo(e.getStartTag().getName(), e.getAttributeValue("style"), START_TAG));
        }
        for (Element child : e.getChildElements()) {
            getInfo(child, tagMap);
        }
        if (e.getEndTag() != null) {
            tagMap.put(e.getEndTag().toString(), new TagInfo(e.getEndTag().getName(), END_TAG));
        }
    }
}