// blob: 512156df41f9efedadfc63d20c6b9cc179b19f67 [file] [log] [blame]
package org.eclipse.stem.utility.generators;
/*******************************************************************************
* Copyright (c) 2009 IBM Corporation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
*******************************************************************************/
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
/**
* This class extracts area and population data from the Russian UN data
* source file (RussiaSorted.txt). It generates both an AREA and a POPULATION
* file under
* org.eclipse.stem.utility\dataMigration\input\AreaPopulationData.
*
*
* To configure, see configuration file :
* org.eclipse.stem.utility/configuration/RussiaDataExtractor.config
*
*
* To run, add path to configuration file in the run profile (i.e.
* C:\stemII\org.eclipse.stem.utility\configuration\RussiaDataExtractor.config)
*
* Add the following values to the run configuration:
*
* C:\stemII\org.eclipse.stem.utility\dataMigration\input\ISO3166-1\ISO3166_1_en.txt
* C:\stemII\org.eclipse.stem.utility\dataMigration\input\properties\EuropeCountries.txt
* C:\stemII\org.eclipse.stem.utility\dataMigration\input\AreaPopulationData\
*
* Overview of the data generation process for the Russia data set.
*
* 1) Data Cleaning : start by running RussiaDataCleaner. This program will
* remove all unnecessary data (e.g. columns) in the original source file. In
* addition, this program will group the data based on its ID (e.g. NO,IT,FR,UK,
* etc). The result will be a file named RussiaSorted.txt with cleaned data.
*
* In other words : Russia.txt --> ||RussiaDataCleaner|| --> RussiaSorted.txt
*
* NOTE: RussiaSorted.txt will be used as the input for the remaining part of
* the process.
*
*
* 2) Data Converter : run RussiaDataConverter to convert file in cleaned format
* into Diva format.
*
*
* 3) Area and Population Data Extraction : run RussiaDataExtractor to extract
* area and population data of all Russian administrations.
*
*
*/
public class RussiaDataExtractor {

    /** Marker substituted for empty CSV fields and recognised as "no value". */
    private static final String UNKNOWN = "UNKNOWN"; //$NON-NLS-1$

    /**
     * Only the first BUFFER_MAX characters of each (marked) input line are
     * split into fields; the columns read by this class are expected to lie
     * within that prefix.
     */
    private static final int BUFFER_MAX = 200;

    /**
     * Our source files for ISO 3166-1 codes and for the list of target
     * countries, plus the output directory and the workspace root.
     */
    private String iso3166_1 = null;

    private String targetFile = null;

    private String outputDir = null;

    private String workspace = null;

    /**
     * Lists for the target data files and the ISO 3166-1 codes. Both are
     * filled by {@link #run()}. dataList is not used by this class.
     */
    List targetList = null;

    List codeList = null;

    List dataList = null;

    /**
     * An index to count the number of unnamed entries in the data set. It is
     * never reset, so generated names stay unique across all input files.
     */
    int unkCounter = 1;

    /**
     * Creates an extractor from a configuration file.
     *
     * The keys WORKSPACE, ISO3166_1, TARGET_LIST and OUTPUT are read. When
     * ISO3166_1 or TARGET_LIST is missing, its value is derived from
     * WORKSPACE; when OUTPUT is missing, GenConstants.DUMMY_DIR is used.
     *
     * @param configFile
     *            path of the configuration file to load.
     * @throws IllegalArgumentException
     *             if WORKSPACE is missing while it is needed to derive
     *             ISO3166_1 or TARGET_LIST.
     */
    public RussiaDataExtractor(String configFile) {
        // Load all configuration variables
        HashMap<String, String> vars = GenUtils.loadConfigFile(configFile);
        workspace = vars.get("WORKSPACE"); //$NON-NLS-1$
        iso3166_1 = vars.get("ISO3166_1"); //$NON-NLS-1$
        targetFile = vars.get("TARGET_LIST"); //$NON-NLS-1$
        outputDir = vars.get("OUTPUT"); //$NON-NLS-1$

        // The workspace variable should end with a slash character
        if (workspace != null && !workspace.endsWith("\\") //$NON-NLS-1$
                && !workspace.endsWith("/")) { //$NON-NLS-1$
            workspace += "\\"; //$NON-NLS-1$
        }

        // Set remaining config vars using WORKSPACE (workspace relative).
        // WORKSPACE is only mandatory when one of them has to be derived.
        if (iso3166_1 == null || targetFile == null) {
            if (workspace == null) {
                throw new IllegalArgumentException(
                        "Configuration file '" + configFile //$NON-NLS-1$
                                + "' must define WORKSPACE unless both ISO3166_1 and TARGET_LIST are set"); //$NON-NLS-1$
            }
            String path = workspace + GenConstants.PARAMS_DIR;
            if (iso3166_1 == null) {
                iso3166_1 = path + GenConstants.ISO3166_1_FILE;
            }
            if (targetFile == null) {
                targetFile = path + GenConstants.EUROPE_COUNTRIES;
            }
        }
        if (outputDir == null) {
            outputDir = GenConstants.DUMMY_DIR;
        }
    }

    /**
     * This method sets up and runs this generator.
     */
    public void run() {
        // Read and keep in memory a list of the data files we want.
        targetList = GenUtils.populateList(targetFile);
        // Read and keep in memory a list of the ISO3166-1 ALPHA2, ALPHA3, and
        // numeric codes.
        codeList = GenUtils.populateList(iso3166_1);
        // Extract the area and population data.
        process();
    }

    /**
     * Generates one area file and one population file for every data file
     * named in the target list. The files are written to the output
     * directory as &lt;name&gt;_AREA.txt and &lt;name&gt;_POPULATION.txt,
     * where &lt;name&gt; comes from the ISO 3166-1 entry resolved for the
     * data file.
     */
    protected void process() {
        // Location names already written. Shared by all input files, so a
        // name that reappears in a later file is skipped as well.
        Set<String> seenLocations = new HashSet<String>();
        File countryDir = new File(outputDir);
        int size = targetList.size();
        // For every country
        for (int i = 0; i < size; i++) {
            String file = ((String) targetList.get(i)).trim();
            processFile(file, countryDir, seenLocations);
        }
    }

    /**
     * Extracts area and population values from a single data file. All state
     * (ISO code, writers) is local to this call, so a failure in one file
     * cannot leak into the processing of the next one.
     */
    private void processFile(String file, File countryDir,
            Set<String> seenLocations) {
        // Open the data source file for this country.
        BufferedReader reader = GenUtils.openReader(file);
        System.out
                .println("\t\t<<<< RUSSIA DATA EXTRACTOR -- Processing Country : " + file + " >>>>"); //$NON-NLS-1$ //$NON-NLS-2$
        if (reader == null) {
            return;
        }

        PrintWriter areaFile = null;
        PrintWriter popFile = null;
        ISOCode1 isoCode = null;
        try {
            String buffer = null;
            while (GenUtils.EOF(buffer = reader.readLine()) != true) {
                String[] items = splitFields(buffer);
                if (!hasRequiredFields(items)) {
                    System.out.println("\t\tSkipping malformed line : " //$NON-NLS-1$
                            + buffer);
                    continue;
                }

                // Get the ISO3166-1 info for this country (once per file).
                if (isoCode == null) {
                    String countryField = items[RussiaDataFormattingHeader.COUNTRY];
                    if (countryField.length() < 2) {
                        System.out
                                .println("\t\tSkipping line without country code : " //$NON-NLS-1$
                                        + buffer);
                        continue;
                    }
                    String country = GenUtils.getCountryName(countryField
                            .substring(0, 2), codeList);
                    isoCode = GenUtils.getISOCode(country, codeList);
                    if (isoCode == null) {
                        // Without the ISO entry neither the file names nor
                        // the unnamed-location ids can be built.
                        System.out
                                .println("\t\tNo ISO 3166-1 entry found for : " //$NON-NLS-1$
                                        + countryField + " -- skipping " + file); //$NON-NLS-1$
                        break;
                    }
                }

                // Create and open the area and population files on the first
                // usable line.
                if (areaFile == null) {
                    areaFile = openOutput(countryDir, isoCode.getName()
                            + "_AREA.txt"); //$NON-NLS-1$
                }
                if (popFile == null) {
                    popFile = openOutput(countryDir, isoCode.getName()
                            + "_POPULATION.txt"); //$NON-NLS-1$
                }

                // Extract location information
                String name2 = items[RussiaDataFormattingHeader.NAME2];
                String name3 = items[RussiaDataFormattingHeader.NAME3];
                String locationName = null;
                if (name3.equals(UNKNOWN) || name3.equals("n.a.")) { //$NON-NLS-1$
                    // Unnamed locations follow our naming convention :
                    // <ALPHA2_CODE>-UNK-<UNKNOWN_COUNTER_VALUE>
                    locationName = isoCode.getAlpha2() + "-UNK-" //$NON-NLS-1$
                            + String.valueOf(unkCounter++);
                } else {
                    // The location name is a concatenation of level 2
                    // administration (NAME3) and the level 1 container
                    // (NAME2). We ignore the level 0 container (NAME1).
                    locationName = name2 + ";" + name3; //$NON-NLS-1$
                }
                // Set.add returns false when the name was already present.
                if (!seenLocations.add(locationName)) {
                    continue;
                }

                // Get first the estimated value and if it is zero, then
                // get the real population value.
                double population = parseNumber(items[RussiaDataFormattingHeader.POP1]);
                if (population == 0.0) {
                    population = parseNumber(items[RussiaDataFormattingHeader.POP2]);
                }
                if (population <= 0.0) {
                    System.out
                            .println("\t\tPopulation is zero for location : " //$NON-NLS-1$
                                    + locationName);
                }

                // Areas below 1.0 (including missing ones) are raised to 1.0.
                double area = parseNumber(items[RussiaDataFormattingHeader.AREA]);
                if (area < 1.0) {
                    area = 1.0;
                }

                // Write area value
                GenUtils.addData(areaFile, locationName + " : 1 : " //$NON-NLS-1$
                        + area + "\n"); //$NON-NLS-1$
                // Write population value
                GenUtils.addData(popFile, locationName + " : 1 : " //$NON-NLS-1$
                        + (int) population + "\n"); //$NON-NLS-1$
            } // while
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close everything even when a line could not be processed. The
            // writers stay null when the file had no usable line.
            if (areaFile != null) {
                areaFile.close();
            }
            if (popFile != null) {
                popFile.close();
            }
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Marks empty fields as UNKNOWN and splits the leading part of the line
     * into its comma separated fields.
     */
    private static String[] splitFields(String line) {
        String replacement = "," + UNKNOWN + ","; //$NON-NLS-1$ //$NON-NLS-2$
        // String.replace does not rescan the text it has just inserted, so a
        // run like ",,," still contains ",," after one pass. A second pass
        // marks the remaining empty fields.
        String marked = line.replace(",,", replacement) //$NON-NLS-1$
                .replace(",,", replacement); //$NON-NLS-1$
        // Use only part of the buffer, not all of it, for efficiency.
        int max = (marked.length() > BUFFER_MAX) ? BUFFER_MAX : marked
                .length();
        return marked.substring(0, max).split(","); //$NON-NLS-1$
    }

    /**
     * Tells whether the line has every column this extractor reads.
     */
    private static boolean hasRequiredFields(String[] items) {
        int count = items.length;
        return count > RussiaDataFormattingHeader.COUNTRY
                && count > RussiaDataFormattingHeader.AREA
                && count > RussiaDataFormattingHeader.POP1
                && count > RussiaDataFormattingHeader.POP2
                && count > RussiaDataFormattingHeader.NAME2
                && count > RussiaDataFormattingHeader.NAME3;
    }

    /**
     * Parses a numeric field; fields that hold no number (for instance the
     * UNKNOWN marker) count as 0.0 instead of aborting the whole run.
     */
    private static double parseNumber(String text) {
        try {
            return Double.parseDouble(text);
        } catch (NumberFormatException e) {
            return 0.0;
        }
    }

    /**
     * Makes sure the output directory exists (including missing parents) and
     * opens a writer for the given file inside it.
     */
    private static PrintWriter openOutput(File directory, String name) {
        if (!directory.exists()) {
            directory.mkdirs();
        }
        // Let File join the path so the platform separator is used.
        return GenUtils.openWriter(new File(directory, name).getPath());
    }

    /**
     * Main execution entry point.
     *
     * @param args
     *            args[0] is the path of the configuration file.
     */
    public static void main(String[] args) {
        final int CONFIG_FILE = 0;
        final int PARAMS = 1;
        if (args.length < PARAMS) {
            UtilLogger.logInformation("\t\t--Wrong arguments--"); //$NON-NLS-1$
            System.out
                    .println("\t\tTo run, please provide the following argument(s) : "); //$NON-NLS-1$
            UtilLogger.logInformation("\t\t\t Configuration file"); //$NON-NLS-1$
            System.exit(1);
        }
        RussiaDataExtractor extractor = new RussiaDataExtractor(
                args[CONFIG_FILE]);
        extractor.run();
    }
}