| package org.eclipse.stem.utility.generators; |
| |
| /******************************************************************************* |
| * Copyright (c) 2009 IBM Corporation and others. |
| * All rights reserved. This program and the accompanying materials |
| * are made available under the terms of the Eclipse Public License v1.0 |
| * which accompanies this distribution, and is available at |
| * http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: |
| * IBM Corporation - initial API and implementation |
| *******************************************************************************/ |
| |
| import java.io.BufferedReader; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.PrintWriter; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Set; |
| |
| /** |
| * STEM II Data Migration/Generation Utilities : Instances of this utility class |
| * clean the data for Canada. This program will remove all unnecesary data from |
| * the original data source file for Canada. The original data source file for |
| * Canada was downloaded from : |
| * http://www.geocomm.com/faq/copyright.html |
| * |
| * |
| * Add the following values to the run configuration: |
| * |
| * C:\workspace2\org.eclipse.stem.utility\dataMigration\input\properties\CountriesLight.txt |
| * C:\shapeFiles\canada\ |
| * |
| * on nelsons machine it used to be... |
| * |
| * C:\stemII\org.eclipse.stem.utility\dataMigration\input\properties\EuropeCountries.txt |
| * C:\EuropeData2007\Canada\ |
| * |
| * Overview of the data generation process for the Canada data set. |
| * |
| * 1) Data Cleaning : start by running CanadaDataCleaner. This program will |
| * remove all unnecesary data (i.e. columns) in the original source file. In |
| * addition, this program will group the data based on its ID (i.e. NO,IT,FR,UK, |
| * etc). The result will be a file named CanadaSorted.txt with cleaned data. |
| * |
| * In other words : Canada.txt --> ||CanadaDataCleaner|| --> CanadaSorted.txt |
| * |
| * NOTE: CanadaSorted.txt will be used as the input for the remaining part of |
| * the process. |
| * |
| * |
| * 2) Data Converter : run CanadaDataConverter to convert file in cleaned format |
| * into Diva format. |
| * |
| * |
| * 3) Area and Population Data Extraction : run CanadaDataExtractor to extract |
| * area and population data or all administrations in Canada. |
| * |
| */ |
| public class CanadaDataCleaner { |
| |
| private String targetFile = null; |
| |
| private String outputDir = null; |
| |
| /** |
| * List for ISO 3166-1 codes and target countries. |
| */ |
| List<String> targetList = null; |
| |
| List<String> codeList = null; |
| |
| /** |
| * Hash map to keep track of polygons. |
| */ |
| HashMap<String, String> map = null; |
| |
| /** |
| * Constructor |
| * |
| * @param targetFile - |
| * data file where we find a list of countries for generation of |
| * their corresponding area.properties files. |
| * @param outputDir - |
| * the output directory for the files generated. |
| * |
| */ |
| public CanadaDataCleaner(String targetFile, String outputDir) { |
| // Set values of global variables. |
| this.targetFile = targetFile; |
| this.outputDir = outputDir; |
| targetList = null; |
| map = new HashMap<String, String>(); |
| } |
| |
| /** |
| * This method sets up and runs the finder. |
| * |
| */ |
| public void run() { |
| // Read and keep in memory a list of the countries we want. |
| targetList = GenUtils.populateList(targetFile); |
| |
| // Now, process each item in the target list. |
| process(); |
| } |
| |
| /** |
| * Process our global list of target countries. |
| * |
| */ |
| protected void process() { |
| |
| // A writer for the new file we are creating. |
| PrintWriter canadaFile = null; |
| |
| // Name of the new properties file. |
| String fileName = null; |
| |
| int size = targetList.size(); |
| |
| for (int i = 0; i < size; i++) { |
| |
| // Get the next country in the list. |
| String file = targetList.get(i).trim(); |
| |
| // Open the data source file for this country. |
| BufferedReader reader = GenUtils.openReader(file); |
| UtilLogger.logInformation("<<<< Processing Country : " + file + " >>>>"); //$NON-NLS-1$ //$NON-NLS-2$ |
| try { |
| |
| if (reader != null) { |
| |
| String buffer = null; |
| |
| // We take a chunk of the data [BUFFER_MIN,BUFFER_MAX] to |
| // make processing more |
| // efficient, we dont need all of it. |
| int BUFFER_MIN = 0; |
| int BUFFER_MAX = 100; |
| |
| while (GenUtils.EOF(buffer = reader.readLine()) != true) { |
| |
| // Make sure that we mark all unknown data. |
| buffer = buffer.replace(",,,", ",UNKNOWN,UNKNOWN,"); //$NON-NLS-1$ //$NON-NLS-2$ |
| buffer = buffer.replace(",,", ",UNKNOWN,"); //$NON-NLS-1$ //$NON-NLS-2$ |
| |
| String[] items = buffer.substring(BUFFER_MIN, |
| BUFFER_MAX).split(","); //$NON-NLS-1$ |
| |
| // Clear buffer of data we dont need. |
| String cleanBuffer = clean(buffer); |
| |
| File countryDir = new File(outputDir); |
| |
| if (countryDir != null && countryDir.exists() == false) { |
| // Directory does not exist. Create it. |
| countryDir.mkdir(); |
| } |
| |
| // Create and open the new file. |
| if (canadaFile == null) { |
| fileName = countryDir + "\\" + "CanadaSorted.txt"; //$NON-NLS-1$ //$NON-NLS-2$ |
| // Create tne new properties.file for this country. |
| canadaFile = GenUtils.openWriter(fileName); |
| } |
| |
| String val = map |
| .get(items[CanadaDataCleaningHeader.NAME1]); |
| if (val != null) { |
| // Concatenate |
| val += cleanBuffer + "\n"; //$NON-NLS-1$ |
| } else { |
| // First entry |
| val = cleanBuffer + "\n"; //$NON-NLS-1$ |
| } |
| map.put(items[CanadaDataCleaningHeader.NAME1], val); |
| |
| }// while |
| |
| // Close all open resources. |
| reader.close(); |
| |
| if (canadaFile != null) { |
| String output = consolidateData(map); |
| if (output != null) { |
| GenUtils.addData(canadaFile, output); |
| } |
| canadaFile.close(); |
| canadaFile = null; |
| UtilLogger.logInformation("" //$NON-NLS-1$ |
| + fileName); |
| fileName = null; |
| } |
| |
| // Run garbage collection |
| //System.gc(); |
| |
| } |
| |
| } catch (IOException e) { |
| e.printStackTrace(); |
| } |
| |
| } // for |
| |
| } |
| |
| /** |
| * |
| * Get all the values in the map. |
| * |
| * @param map |
| * a map of locations |
| * |
| * @return String a string will all locations grouped secuentialy. |
| * |
| */ |
| String consolidateData(HashMap<String, String> map) { |
| |
| String value = null; |
| |
| if (map.isEmpty()) { |
| return null; |
| } |
| |
| Set<String> keys = map.keySet(); |
| |
| Iterator it = keys.iterator(); |
| |
| while (it.hasNext()) { |
| |
| String key = (String) it.next(); |
| |
| if (value == null) { |
| value = map.get(key); |
| } else { |
| value += map.get(key); |
| } |
| |
| } |
| |
| return value; |
| } |
| |
| /** |
| * Remove all irrelevant data from our buffer |
| */ |
| String clean(String buffer) { |
| // count by 1 |
| final int LIMIT = 9; |
| String newBuffer = null; |
| |
| String[] items = buffer.split(","); //$NON-NLS-1$ |
| |
| int index = 0; |
| |
| while (index < items.length) { |
| |
| if (index >= LIMIT) { |
| newBuffer += "," + items[index++]; //$NON-NLS-1$ |
| continue; |
| } |
| |
| // set the country name manually |
| |
| newBuffer = "CAN, CANADA"; //$NON-NLS-1$ |
| |
| switch (index) { |
| /* |
| case CanadaDataCleaningHeader.ID: |
| newBuffer = items[index]; |
| break; |
| */ |
| |
| |
| /* |
| case CanadaDataCleaningHeader.AREA: |
| newBuffer = "," + items[index]; //$NON-NLS-1$ |
| break; |
| |
| case CanadaDataCleaningHeader.COUNTRY: |
| newBuffer += "," + items[index]; //$NON-NLS-1$ |
| break; |
| */ |
| case CanadaDataCleaningHeader.NAME1: |
| newBuffer += "," + items[index]; //$NON-NLS-1$ |
| break; |
| /* |
| case CanadaDataCleaningHeader.NAME2: |
| newBuffer += "," + items[index]; //$NON-NLS-1$ |
| break; |
| |
| case CanadaDataCleaningHeader.NAME3: |
| newBuffer += "," + items[index]; //$NON-NLS-1$ |
| break; |
| |
| case CanadaDataCleaningHeader.POP1: |
| newBuffer += "," + items[index]; //$NON-NLS-1$ |
| break; |
| |
| case CanadaDataCleaningHeader.POP2: |
| newBuffer += "," + items[index]; //$NON-NLS-1$ |
| break; |
| */ |
| } |
| |
| index++; |
| |
| } |
| |
| return newBuffer; |
| |
| } |
| |
| /** |
| * Main execution entry point. |
| * |
| * @param args |
| */ |
| public static void main(String[] args) { |
| final int TARGET_LIST = 0; |
| final int OUTPUT_DIR = 1; |
| final int PARAMS = 2; |
| |
| if (args.length < PARAMS) { |
| UtilLogger.logInformation("--Wrong arguments--"); //$NON-NLS-1$ |
| System.out |
| .println("\tTo run, please provide the following arguments : "); //$NON-NLS-1$ |
| UtilLogger.logInformation("\t\t Target countries file"); //$NON-NLS-1$ |
| UtilLogger.logInformation("\t\t Output directory"); //$NON-NLS-1$ |
| return; |
| } |
| |
| CanadaDataCleaner formatter = new CanadaDataCleaner(args[TARGET_LIST], |
| args[OUTPUT_DIR]); |
| formatter.run(); |
| } |
| } |