blob: 1b5eac5f9fdb5d411fb898dde6b5f039e4767436 [file] [log] [blame]
/**
*
*/
package org.eclipse.stem.utility.generators;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
/*******************************************************************************
* Copyright (c) 2006 IBM Corporation and others. All rights reserved. This
* program and the accompanying materials are made available under the terms of
* the Eclipse Public License v1.0 which accompanies this distribution, and is
* available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: IBM Corporation - initial API and implementation
******************************************************************************/
/**
* STEM II Data Migration/Generation Utilities : Instances of this class sort
* the level 2 administration names of every target country
* lexicographically, grouped by the level 1 administration they belong to.
* The country data files to process and the ISO3166-1 code table are read
* from the locations given in the configuration file. For each country the
* output is written, one entry per line, in a colon separated format of the
* form:
*
* INDEX:ALPHA2:ALPHA3:COUNTRY:ADMIN1:ADMIN2
*
* where INDEX is the 1-based position of ADMIN2 in the sorted list of its
* ADMIN1. This output will be used during generation of the level 2
* descriptors and during generation of the relationship files.
*
* To configure, see configuration file :
* org.eclipse.stem.utility/configuration/Admin2LexSorter.config
*
*
* To run, add path to configuration file in the run profile (i.e.
* C:\stemII\org.eclipse.stem.utility\configuration\Admin2LexSorter.config)
*
*
*/
public class Admin2LexSorter {

	/**
	 * Only the characters in [BUFFER_MIN, BUFFER_MAX) of every input line are
	 * parsed; the fields this class needs are expected within that prefix.
	 */
	private static final int BUFFER_MIN = 0;

	private static final int BUFFER_MAX = 120;

	/**
	 * Our source files for ISO 3166-1 codes and for the list of target
	 * countries, the output directory and the workspace root the defaults
	 * are derived from.
	 */
	private String iso3166_1 = null;

	private String targetFile = null;

	private String outputDir = null;

	private String workspace = null;

	/**
	 * List of target countries (each entry names the data file of one
	 * country) and list of ISO 3166-1 codes. Both are loaded by
	 * {@link #run()}.
	 */
	List targetList = null;

	List codeList = null;

	/**
	 * Level 2 names collected for the country currently being processed,
	 * keyed by the name of the level 1 administration that contains them.
	 * The map is emptied after every country.
	 */
	HashMap<String, List<String>> adminData = null;

	/**
	 * Creates a sorter configured from the given configuration file.
	 *
	 * The keys WORKSPACE, ISO3166_1, TARGET_LIST and PROPERTIES_OUTPUT are
	 * read. Any of the last three that is missing is derived from WORKSPACE,
	 * so WORKSPACE is only mandatory when at least one of them is absent.
	 *
	 * @param configFile
	 *            path to the configuration file.
	 * @throws IllegalArgumentException
	 *             if a default location is needed but WORKSPACE is not
	 *             defined in the configuration file.
	 */
	public Admin2LexSorter(String configFile) {
		// Load all configuration variables
		HashMap<String, String> vars = GenUtils.loadConfigFile(configFile);
		workspace = vars.get("WORKSPACE"); //$NON-NLS-1$
		iso3166_1 = vars.get("ISO3166_1"); //$NON-NLS-1$
		targetFile = vars.get("TARGET_LIST"); //$NON-NLS-1$
		outputDir = vars.get("PROPERTIES_OUTPUT"); //$NON-NLS-1$

		// The workspace variable should end with a slash character
		if (workspace != null && !workspace.endsWith("\\")) { //$NON-NLS-1$
			workspace += "\\"; //$NON-NLS-1$
		}

		// Set remaining config vars using WORKSPACE (workspace relative)
		if (iso3166_1 == null || targetFile == null || outputDir == null) {
			if (workspace == null) {
				// Fail with a clear message instead of a bare
				// NullPointerException.
				throw new IllegalArgumentException(
						"WORKSPACE is not defined in configuration file " //$NON-NLS-1$
								+ configFile);
			}
			String path = workspace + GenConstants.PARAMS_DIR;
			if (iso3166_1 == null) {
				iso3166_1 = path + GenConstants.ISO3166_1_FILE;
			}
			if (targetFile == null) {
				targetFile = path + GenConstants.COUNTRIES_LIST;
			}
			if (outputDir == null) {
				outputDir = path + GenConstants.LEX_DIR;
			}
		}

		targetList = null;
		adminData = new HashMap<String, List<String>>();
	}

	/**
	 * This method sets up and runs this generator: it loads the list of
	 * target countries and the ISO3166-1 codes, processes every country and
	 * finally logs whatever is left in {@link #adminData}.
	 */
	public void run() {
		// Read and keep in memory a list of the countries we want.
		targetList = GenUtils.populateList(targetFile);
		// Read and keep in memory a list of the ISO3166-1 ALPHA2, ALPHA3, and
		// numeric codes.
		codeList = GenUtils.populateList(iso3166_1);
		// Collect all level 2 data, sort it and store the result.
		process();
		// Now, print the result. Note that process() empties adminData after
		// each country, so normally only the heading is logged here.
		printSortedSequences();
	}

	/**
	 * Collect all level 2 administration data and its containment
	 * relationship to level 1 data. Sort the collections lexicographically
	 * and store the result for each country.
	 *
	 * Every country is handled independently: a failure while reading one
	 * country is reported and does not affect the following ones.
	 */
	protected void process() {
		// For every country
		for (Object entry : targetList) {
			processCountry(((String) entry).trim());
		}
	}

	/**
	 * Reads the data file of one country, groups its level 2 names by level 1
	 * name, sorts every group and writes the result to the country's
	 * "_ADMIN2_LEX.txt" file.
	 *
	 * @param file
	 *            the data source file of the country.
	 */
	private void processCountry(String file) {
		// Open the data source file for this country.
		BufferedReader reader = GenUtils.openReader(file);
		System.out
				.println("\t\t<<<< Processing country : " + file + " >>>>"); //$NON-NLS-1$ //$NON-NLS-2$
		if (reader == null) {
			return;
		}

		// A writer for the new file we are creating.
		PrintWriter lexFile = null;
		try {
			// Object holding ISO3166-1 related data.
			ISOCode1 isoCode = null;
			// Name of the new file.
			String fileName = null;
			// An index for unknown or n.a. entries
			int unkCounter = 1;
			// Level of the previous line; starts at a value no real line
			// has, so that the very first line counts as a transition.
			int oldlevel = -2;
			// Entries already seen at the current data level.
			List<String> entryList = null;

			String buffer = null;
			while (!GenUtils.EOF(buffer = reader.readLine())) {
				// Make sure that we mark all unknown data.
				String line = markUnknownFields(buffer);
				String[] items = GenUtils
						.extract(line.length() < BUFFER_MAX ? line : line
								.substring(BUFFER_MIN, BUFFER_MAX));

				// Get the ISO3166-1 info for this country
				if (isoCode == null) {
					isoCode = GenUtils.getISOCode(
							items[ISOData.COUNTRY_NAME], codeList);
				}

				// Get the level of the data we are processing.
				int level = GenUtils.getDataLevel(items);

				// Create and open the new file.
				if (lexFile == null) {
					// If the directory does not exist for this country,
					// create it (and any missing parent) using ISO alpha3
					// code.
					File countryDir = new File(outputDir
							+ isoCode.getAlpha3());
					if (!countryDir.exists()) {
						countryDir.mkdirs();
					}
					// Create the file name
					fileName = countryDir + "\\" + isoCode.getAlpha3() //$NON-NLS-1$
							+ "_ADMIN2_LEX.txt"; //$NON-NLS-1$
					// Create the new file
					lexFile = GenUtils.openWriter(fileName);
				}

				// We have switched to a different data level (i.e. admin 0
				// to admin 1): reset our repeated history.
				if (level != oldlevel) {
					oldlevel = level;
					entryList = new ArrayList<String>();
				}

				// Ignore all data that is not level 2.
				if (level == DataTypes.LEVEL_0 || level == DataTypes.LEVEL_1) {
					continue;
				}

				// Get the level 2 data we need :
				String admin1 = items[ISOData.ADMIN1_NAME];
				String admin2 = items[ISOData.ADMIN2_NAME];

				// If entity is UNKNOWN or N.A. give it a unique generated
				// name.
				if (admin2.equals("UNKNOWN") || admin2.equals("N.A.")) { //$NON-NLS-1$ //$NON-NLS-2$
					admin2 = isoCode.getAlpha3() + ".L2." //$NON-NLS-1$
							+ String.valueOf(unkCounter++);
				}

				// Make sure this is not a repeated entry.
				if (GenUtils.isRepeated(admin1 + admin2, entryList)) {
					continue;
				}
				// Remember this entity to avoid duplicates.
				entryList.add(admin1 + admin2);

				// One list of administration 2 names for every level 1
				// location.
				List<String> admin2List = adminData.get(admin1);
				if (admin2List == null) {
					admin2List = new ArrayList<String>();
					adminData.put(admin1, admin2List);
				}
				admin2List.add(admin2);
			}// while

			// Sort every list.
			for (List<String> admin2List : adminData.values()) {
				Collections.sort(admin2List);
			}

			System.out
					.println("\t\tDescriptors by lexicographic ordering : "); //$NON-NLS-1$
			writeCountryData(lexFile, isoCode);

			if (lexFile != null) {
				lexFile.close();
				lexFile = null;
				UtilLogger.logInformation("\t\tGenerated : " + fileName); //$NON-NLS-1$
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Never let the data or the resources of this country leak into
			// the next one, whether or not processing succeeded.
			adminData.clear();
			if (lexFile != null) {
				lexFile.close();
			}
			try {
				reader.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Logs the sorted content of {@link #adminData} and appends one line per
	 * level 2 name to the output file, in the form
	 * INDEX:ALPHA2:ALPHA3:COUNTRY:ADMIN1:ADMIN2 where INDEX is the 1-based
	 * position of the name within its level 1 group.
	 *
	 * @param lexFile
	 *            the writer of the country's output file.
	 * @param isoCode
	 *            the ISO3166-1 data of the country; only dereferenced when
	 *            there is data to write.
	 */
	private void writeCountryData(PrintWriter lexFile, ISOCode1 isoCode) {
		for (String admin1 : adminData.keySet()) {
			// admin1 is the key in our HashMap.
			List<String> admin2List = adminData.get(admin1);
			int index = 1;
			UtilLogger.logInformation("\t\tAdmin 1 = " + admin1); //$NON-NLS-1$
			for (String admin2 : admin2List) {
				UtilLogger.logInformation("\t\tAt : " + index //$NON-NLS-1$
						+ " -- admin2 is : " + admin2); //$NON-NLS-1$
				GenUtils.addData(lexFile, String.valueOf(index)
						+ ":" + isoCode.getAlpha2() + ":" //$NON-NLS-1$ //$NON-NLS-2$
						+ isoCode.getAlpha3() + ":" //$NON-NLS-1$
						+ isoCode.getName() + ":" + admin1 + ":" //$NON-NLS-1$ //$NON-NLS-2$
						+ admin2 + "\n"); //$NON-NLS-1$
				index++;
			}
		}
	}

	/**
	 * Replaces every empty comma separated field by the marker UNKNOWN.
	 *
	 * A single replacement pass skips every second field of a run of empty
	 * fields (",,," becomes ",UNKNOWN,,"), so the replacement is repeated
	 * until no empty field is left.
	 *
	 * @param line
	 *            a line of comma separated data.
	 * @return the line with all empty inner fields marked.
	 */
	private static String markUnknownFields(String line) {
		String marked = line;
		while (marked.contains(",,")) { //$NON-NLS-1$
			marked = marked.replace(",,", ",UNKNOWN,"); //$NON-NLS-1$ //$NON-NLS-2$
		}
		return marked;
	}

	/**
	 * Logs the content of {@link #adminData}: every level 1 name followed by
	 * its level 2 names, each with its 1-based position in the list.
	 */
	protected void printSortedSequences() {
		UtilLogger.logInformation("\t\tDescriptors by lexicographic ordering : "); //$NON-NLS-1$
		if (adminData == null) {
			return;
		}
		for (String admin1 : adminData.keySet()) {
			int index = 1;
			UtilLogger.logInformation("\t\tAdmin 1 = " + admin1); //$NON-NLS-1$
			for (String admin2 : adminData.get(admin1)) {
				UtilLogger.logInformation("\t\tAt : " + index++ + " -- admin2 is : " //$NON-NLS-1$ //$NON-NLS-2$
						+ admin2);
			}
		}
	}

	/**
	 * Find an entry in the lexicographically sorted list and get its index.
	 * The comparison ignores case (both sides are upper-cased).
	 *
	 * @param list
	 *            a list of strings.
	 * @param entry
	 *            the entry we want to find in the list.
	 *
	 * @return the 1-based index of the entry in the list, or -1 if it is not
	 *         found.
	 */
	protected int indexLookup(List<String> list, String entry) {
		// Upper-case the wanted entry once instead of once per element.
		String wanted = (entry == null) ? null : entry.toUpperCase();
		int index = 1;
		for (String current : list) {
			if (current.toUpperCase().equals(wanted)) {
				return index;
			}
			index++;
		}
		// Not found.
		UtilLogger.logInformation("\t\tindexLookup : entry not found for " + entry); //$NON-NLS-1$
		return -1;
	}

	/**
	 * Main execution entry point.
	 *
	 * @param args
	 *            the path to the configuration file is expected as the first
	 *            argument.
	 */
	public static void main(String[] args) {
		final int CONFIG_FILE = 0;
		final int PARAMS = 1;
		if (args.length < PARAMS) {
			UtilLogger.logInformation("\t\t--Wrong arguments--"); //$NON-NLS-1$
			System.out
					.println("\t\tTo run, please provide the following argument(s) : "); //$NON-NLS-1$
			UtilLogger.logInformation("\t\t\t Configuration file"); //$NON-NLS-1$
			System.exit(1);
		}
		Admin2LexSorter sorter = new Admin2LexSorter(args[CONFIG_FILE]);
		sorter.run();
	}
}