| /******************************************************************************* |
| * |
| * Copyright (c) 2018 michaelcgood |
| * https://github.com/michaelcgood/HTML-to-Apache-POI-RichTextString/blob/master/LICENSE. |
| * |
| * Contributors: |
| * michaelcgood - initial API and implementation |
| * Sean Muir - Modified HEAVY_REGEX to remove col tags |
| * |
| *******************************************************************************/ |
| package org.eclipse.mdht.cda.xml.ui.handlers.html; |
| |
| import java.util.ArrayDeque; |
| import java.util.ArrayList; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.commons.lang3.StringUtils; |
| import org.apache.poi.ss.usermodel.Font; |
| import org.apache.poi.ss.usermodel.IndexedColors; |
| import org.apache.poi.ss.usermodel.RichTextString; |
| import org.apache.poi.ss.usermodel.Workbook; |
| import org.apache.poi.xssf.usermodel.XSSFRichTextString; |
| |
| import net.htmlparser.jericho.Config; |
| import net.htmlparser.jericho.Element; |
| import net.htmlparser.jericho.Source; |
| |
| /** |
|  * HTMLToExcel - converts an HTML fragment (narrative content) into an Apache POI |
|  * {@link RichTextString} so that some level of the narrative can be put into a spreadsheet cell. |
|  * <p> |
|  * The markup is parsed with the open source Jericho HTML parser. Every tag found by the parser is |
|  * stripped from the text, and the character ranges enclosed by tags are collected into |
|  * per-index {@link Font} maps. The routine is derived from michaelcgood's open source |
|  * HTML-to-Apache-POI-RichTextString (see the file header). |
|  * <p> |
|  * The value returned by {@link #fromHtmlToCellValue(String, Workbook)} currently carries text |
|  * only: the collected fonts are not applied to it. |
|  * |
|  * @author seanmuir |
|  * |
|  */ |
| public class HTMLToExcel { |
| |
|     /** Markup removed before parsing: col tags and several spellings of the line break tag. */ |
|     private static final Pattern HEAVY_REGEX = Pattern.compile("(<col>)|(</col>)|(<br/>)|(</br>)|(<br />)|(< /br>)"); |
| |
|     /** Tag type recorded for an opening tag. */ |
|     private static final int START_TAG = 0; |
| |
|     /** Tag type recorded for a closing tag and for the start tag of an element without end tag. */ |
|     private static final int END_TAG = 1; |
| |
|     /** Separator appended after the text of every converted div element. */ |
|     private static final String NEW_LINE = System.getProperty("line.separator"); |
| |
|     /** |
|      * Converts HTML into a rich text cell value. |
|      * <p> |
|      * The tags matched by HEAVY_REGEX are removed, the remainder is wrapped in a div, and every |
|      * div element that Jericho reports for the result is converted with |
|      * {@link #createCellValue(String, Workbook)} and appended to the returned text. |
|      * <p> |
|      * NOTE(review): getAllElements("div") presumably also returns div elements nested inside the |
|      * wrapper, in which case their text is emitted once as part of the wrapper and once on its |
|      * own - confirm whether that is wanted. |
|      * |
|      * @param html the HTML fragment; null is treated as an empty fragment |
|      * @param workBook workbook used to create the fonts of formatted ranges |
|      * @return the text of the converted div elements, each followed by a line separator |
|      */ |
|     public RichTextString fromHtmlToCellValue(String html, Workbook workBook) { |
|         // Static Jericho setting, assigned before the Source below is created. |
|         Config.IsHTMLEmptyElementTagRecognised = true; |
| |
|         String input = html == null ? "" : html; |
|         String wrapped = "<div>" + HEAVY_REGEX.matcher(input).replaceAll("") + "</div>"; |
| |
|         List<RichTextDetails> cellValues = new ArrayList<RichTextDetails>(); |
|         for (Element div : new Source(wrapped).getAllElements("div")) { |
|             cellValues.add(createCellValue(div.toString(), workBook)); |
|         } |
|         return mergeTextDetails(cellValues); |
|     } |
| |
|     /** |
|      * Joins the text of all converted fragments into a single rich text string. |
|      * <p> |
|      * Every fragment is followed by NEW_LINE, including the last one. The font maps of the |
|      * fragments are deliberately not read here: no font is applied to the result, so building a |
|      * merged map would be wasted work. |
|      * |
|      * @param cellValues the converted fragments, in output order |
|      * @return a rich text string holding the concatenated text |
|      */ |
|     private static RichTextString mergeTextDetails(List<RichTextDetails> cellValues) { |
|         StringBuilder textBuffer = new StringBuilder(); |
|         for (RichTextDetails richTextDetail : cellValues) { |
|             textBuffer.append(richTextDetail.getRichText()).append(NEW_LINE); |
|         } |
|         return new XSSFRichTextString(textBuffer.toString()); |
|     } |
| |
|     /** |
|      * Strips the tags of an HTML fragment and records which character ranges of the remaining |
|      * text are enclosed by tags. |
|      * |
|      * @param html the fragment to convert; null is treated as an empty fragment |
|      * @param workBook workbook used to create the fonts of the returned font map |
|      * @return the tag-free text together with a map from character index to font |
|      */ |
|     public static RichTextDetails createCellValue(String html, Workbook workBook) { |
|         Config.IsHTMLEmptyElementTagRecognised = true; |
|         String input = html == null ? "" : html; |
| |
|         Map<String, TagInfo> tagMap = new LinkedHashMap<String, TagInfo>(550, .95f); |
|         for (Element e : new Source(input).getChildElements()) { |
|             getInfo(e, tagMap); |
|         } |
|         if (tagMap.isEmpty()) { |
|             // No tags at all: an empty alternation would match the empty string at every |
|             // position and never resolve to a TagInfo, so hand the text back unchanged. |
|             return new RichTextDetails(input, buildFontMap(new ArrayList<RichTextInfo>(), workBook)); |
|         } |
| |
|         // Tags are matched literally; attribute values may contain regex metacharacters. |
|         List<String> quotedTags = new ArrayList<String>(tagMap.size()); |
|         for (String tag : tagMap.keySet()) { |
|             quotedTags.add(Pattern.quote(tag)); |
|         } |
|         Matcher matcher = Pattern.compile("(" + StringUtils.join(quotedTags, "|") + ")").matcher(input); |
| |
|         StringBuffer textBuffer = new StringBuffer(); |
|         List<RichTextInfo> textInfos = new ArrayList<RichTextInfo>(); |
|         // Used as a stack of open ranges. An ArrayList is used because entries may be null (a |
|         // span without a color style yields no RichTextInfo) and ArrayDeque rejects null elements. |
|         List<RichTextInfo> openTags = new ArrayList<RichTextInfo>(); |
|         while (matcher.find()) { |
|             // Drop the tag itself; textBuffer.length() is now the tag's position in the output. |
|             matcher.appendReplacement(textBuffer, ""); |
|             String tagText = matcher.group(1); |
|             TagInfo currentTag = tagMap.get(tagText); |
|             if (START_TAG == currentTag.getTagType()) { |
|                 openTags.add(getRichTextInfo(currentTag, textBuffer.length(), workBook)); |
|             } else if (tagText.startsWith("</") && !openTags.isEmpty()) { |
|                 // Only a real end tag closes a range. The start tag of an element without end tag |
|                 // is also registered as END_TAG by getInfo; it is merely stripped from the text so |
|                 // that it cannot cut the enclosing range short. |
|                 RichTextInfo info = openTags.remove(openTags.size() - 1); |
|                 if (info != null) { |
|                     info.setEndIndex(textBuffer.length()); |
|                     textInfos.add(info); |
|                 } |
|             } |
|         } |
|         matcher.appendTail(textBuffer); |
| |
|         return new RichTextDetails(textBuffer.toString(), buildFontMap(textInfos, workBook)); |
|     } |
| |
|     /** |
|      * Expands the collected ranges into a map from character index to font. |
|      * <p> |
|      * Ranges are processed in list order; when several ranges cover the same index their styles |
|      * are merged into the font already stored for that index. Note that a new workbook font is |
|      * created for every index that has no font yet. |
|      * |
|      * @param textInfos the closed ranges; entries reporting themselves as invalid are skipped |
|      * @param workBook workbook used to create fonts |
|      * @return the font for every covered character index |
|      */ |
|     private static Map<Integer, Font> buildFontMap(List<RichTextInfo> textInfos, Workbook workBook) { |
|         Map<Integer, Font> fontMap = new LinkedHashMap<Integer, Font>(550, .95f); |
|         for (RichTextInfo richTextInfo : textInfos) { |
|             if (!richTextInfo.isValid()) { |
|                 continue; |
|             } |
|             for (int i = richTextInfo.getStartIndex(); i < richTextInfo.getEndIndex(); i++) { |
|                 Font merged = mergeFont( |
|                     fontMap.get(i), richTextInfo.getFontStyle(), richTextInfo.getFontValue(), workBook); |
|                 fontMap.put(i, merged); |
|             } |
|         } |
|         return fontMap; |
|     } |
| |
|     /** |
|      * Applies one style to a font, creating the font first when none exists yet. |
|      * |
|      * @param font the font to modify, or null to create a new one in the workbook |
|      * @param fontStyle the style to apply |
|      * @param fontValue the style value; only consulted for COLOR |
|      * @param workBook workbook used to create a font when font is null |
|      * @return the (possibly newly created) font with the style applied |
|      */ |
|     private static Font mergeFont(Font font, STYLES fontStyle, String fontValue, Workbook workBook) { |
|         if (font == null) { |
|             font = workBook.createFont(); |
|         } |
| |
|         switch (fontStyle) { |
|             case BOLD: |
|             case EM: |
|             case STRONG: |
|                 // Bold is not applied at present; the former setBoldweight call is disabled. |
|                 break; |
|             case UNDERLINE: |
|                 font.setUnderline(Font.U_SINGLE); |
|                 break; |
|             case ITALLICS: |
|                 font.setItalic(true); |
|                 break; |
|             case PRE: |
|                 font.setFontName("Courier New"); |
|                 // Must not fall through into COLOR. |
|                 break; |
|             case COLOR: |
|                 // Any non-empty color value is mapped to black; the value itself is not parsed. |
|                 if (!isEmpty(fontValue)) { |
|                     font.setColor(IndexedColors.BLACK.getIndex()); |
|                 } |
|                 break; |
|             default: |
|                 break; |
|         } |
| |
|         return font; |
|     } |
| |
|     /** |
|      * Creates the range descriptor for an opening tag. |
|      * <p> |
|      * For a span, a COLOR range is created from its style attribute; when the attribute declares |
|      * color more than once the last declaration wins. Any other tag yields a range carrying the |
|      * style resolved from its tag name. |
|      * |
|      * @param currentTag the opening tag |
|      * @param startIndex index in the tag-free text at which the range starts |
|      * @param workBook not used by this method |
|      * @return the open range (end index -1), or null for a span without a color declaration |
|      */ |
|     private static RichTextInfo getRichTextInfo(TagInfo currentTag, int startIndex, Workbook workBook) { |
|         STYLES tagStyle = STYLES.fromValue(currentTag.getTagName()); |
|         if (tagStyle != STYLES.SPAN) { |
|             return new RichTextInfo(startIndex, -1, tagStyle); |
|         } |
| |
|         RichTextInfo info = null; |
|         if (!isEmpty(currentTag.getStyle())) { |
|             for (String declaration : currentTag.getStyle().split(";")) { |
|                 String[] styleDetails = declaration.split(":"); |
|                 if (styleDetails.length > 1 && "COLOR".equalsIgnoreCase(styleDetails[0].trim())) { |
|                     info = new RichTextInfo(startIndex, -1, STYLES.COLOR, styleDetails[1]); |
|                 } |
|             } |
|         } |
|         return info; |
|     } |
| |
|     /** |
|      * Tells whether a string is null or consists of whitespace only. |
|      * |
|      * @param str the string to test, may be null |
|      * @return true when str is null or trims to the empty string |
|      */ |
|     private static boolean isEmpty(String str) { |
|         return str == null || str.trim().length() == 0; |
|     } |
| |
|     /** |
|      * Registers the tags of an element and, recursively, of all its descendants. |
|      * <p> |
|      * The map is keyed by the literal text of each tag, so identical tags share one entry and a |
|      * later registration replaces an earlier one. |
|      * |
|      * @param e the element to register |
|      * @param tagMap receives one entry per distinct tag text |
|      */ |
|     private static void getInfo(Element e, Map<String, TagInfo> tagMap) { |
|         tagMap.put( |
|             e.getStartTag().toString(), |
|             new TagInfo(e.getStartTag().getName(), e.getAttributeValue("style"), START_TAG)); |
|         for (Element child : e.getChildElements()) { |
|             getInfo(child, tagMap); |
|         } |
|         if (e.getEndTag() != null) { |
|             tagMap.put(e.getEndTag().toString(), new TagInfo(e.getEndTag().getName(), END_TAG)); |
|         } else { |
|             // Handling self closing tags: the start tag text is re-registered as END_TAG so that |
|             // it never opens a range that nothing would close. |
|             tagMap.put(e.getStartTag().toString(), new TagInfo(e.getStartTag().getName(), END_TAG)); |
|         } |
|     } |
| |
| } |