| /** |
| * boilerpipe |
| * |
| * Copyright (c) 2009 Christian Kohlschütter |
| * |
| * The author licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package de.l3s.boilerpipe.sax; |
| |
| import java.io.IOException; |
| |
| import org.xml.sax.InputSource; |
| import org.xml.sax.SAXException; |
| |
| import de.l3s.boilerpipe.BoilerpipeInput; |
| import de.l3s.boilerpipe.BoilerpipeProcessingException; |
| import de.l3s.boilerpipe.document.TextDocument; |
| |
| /** |
| * Parses an {@link InputSource} using SAX and returns a {@link TextDocument}. |
| * |
| * @author Christian Kohlschütter |
| */ |
| public final class BoilerpipeSAXInput implements BoilerpipeInput { |
| private final InputSource is; |
| |
| private boolean stopped = false; |
| |
| private int maxTextBlocks = -1; |
| |
| /** |
| * Creates a new instance of {@link BoilerpipeSAXInput} for the given {@link InputSource}. |
| * |
| * @param is |
| * @throws SAXException |
| */ |
| public BoilerpipeSAXInput(final InputSource is, final int maxTextBlocks_) throws SAXException { |
| this.is = is; |
| stopped = false; |
| maxTextBlocks = maxTextBlocks_; |
| } |
| |
| /** |
| * Creates a new instance of {@link BoilerpipeSAXInput} for the given {@link InputSource}. |
| * |
| * @param is |
| * @throws SAXException |
| */ |
| public BoilerpipeSAXInput(final InputSource is) throws SAXException { |
| this.is = is; |
| stopped = false; |
| } |
| |
| /** |
| * Retrieves the {@link TextDocument} using a default HTML parser. |
| */ |
| public TextDocument getTextDocument() throws BoilerpipeProcessingException { |
| BoilerpipeHTMLParser boilerpipeHTMLParser = new BoilerpipeHTMLParser(maxTextBlocks); |
| TextDocument textDocument = getTextDocument(boilerpipeHTMLParser); |
| if (boilerpipeHTMLParser.hasStopped()) { |
| stopped = true; |
| } |
| return textDocument; |
| } |
| |
| /** |
| * Retrieves the {@link TextDocument} using the given HTML parser. |
| * |
| * @param parser The parser used to transform the input into boilerpipe's internal representation. |
| * @return The retrieved {@link TextDocument} |
| * @throws BoilerpipeProcessingException |
| */ |
| public TextDocument getTextDocument(final BoilerpipeHTMLParser parser) throws BoilerpipeProcessingException { |
| try { |
| parser.parse(is); |
| if (parser.hasStopped()) { |
| stopped = true; |
| } |
| } catch (IOException e) { |
| throw new BoilerpipeProcessingException(e); |
| } catch (SAXException e) { |
| throw new BoilerpipeProcessingException(e); |
| } |
| |
| return parser.toTextDocument(); |
| } |
| |
| /** |
| * Indicates a stop in parsing because the max input size has been reached. |
| * |
| * @return maxCount was reached? |
| */ |
| public boolean hasStopped() { |
| return stopped; |
| } |
| } |