/*
 * NCSA Data To Knowledge (D2K) Project
 * Automated Learning Group
 * National Center for Supercomputing Applications
 * University of Illinois at Urbana-Champaign
 * 605 E. Springfield, Champaign, IL 61820
 *
 * d2k@ncsa.uiuc.edu
 *
 * Copyright 2003, Board of Trustees of the University of Illinois,
 * All Rights Reserved
 *
 * NCSA Data To Knowledge (D2K) software, source code, binary code, and Java-byte 
 * code including D2K modules and D2K itineraries (hereafter, Software) is 
 * copyrighted by the Board of Trustees of the University of Illinois (UI),
 * and ownership remains with the UI.
 *
 * Prior to receiving this Software Source Code, You must have agreed to a
 * non-exclusive, non-transferable, restricted License of Software with UI for
 * limited Research Use and/or Internal Business Use. The terms of that License
 * control the use of this Software Source Code. For the sake of convenience, 
 * certain provisions of that License are stated here. However, in the event
 * of any real or perceived differences between the terms of the License and the
 * statements made herein, the License controls.  
 *
 * You may not distribute the Software including the Binary Code and this Source
 * Code to third parties.
 *
 * You may make Derivative Works. You are encouraged to provide information to 
 * UI regarding Derivative Works and your experience with Software. However, if
 * You make any Derivative Work based on or derived from the Software, then You
 * will: (1) clearly notify users that such Derivative Work is a modified version
 * and uses or is derived from the original NCSA Data To Knowledge (D2K)
 * developed at UI, and include specific language in that notice as provided for
 * in the License, and (2) acknowledge via citation and provide UI with a copy of
 * any report or publication using the Software or Derivative Work.
 *
 * If You wish to make Commercial Use of the Software or Derivative Works, then
 * You should contact the UI, c/o NCSA, to negotiate an appropriate license for
 * such Commercial Use. Commercial Use includes sale, lease, license,
 * distribution or otherwise making the Software or Derivative Works available to
 * third parties, which includes, but is not limited to, integration of all or
 * part of the Software or Derivative Work into a product for sale or license by
 * or on behalf of You to third parties.
 *
 * UI MAKES NO REPRESENTATIONS ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY
 * PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY. THE UI
 * SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY THE USERS OF THIS SOFTWARE.
 *
 * By using or copying this Software, You agree to abide by the copyright law
 * and all other applicable laws of the U.S. including, but not limited to,
 * export control laws, and the terms of the License. UI shall have the right
 * to terminate its license with You immediately upon Your breach of, or
 * non-compliance with, any of its terms. You may be held legally responsible
 * for any copyright infringement that is caused or encouraged by Your failure
 * to abide by the terms of the License.
 */

package ncsa.d2k.modules.core.transform.attribute;

import java.util.*;
import ncsa.d2k.core.modules.*;
import ncsa.d2k.modules.core.datatype.table.*;

/**
 * This module examines columns in a <code>MutableTable</code> and, for
 * appropriate columns which contain nominal values, converts these single
 * columns into multiple columns (of <code>boolean</code>s or
 * <code>int</code>s) -- one for each possible value of the attribute.
 * <p>
 * If the input <code>MutableTable</code> implements the
 * <code>ExampleTable</code> interface, only columns marked as inputs and
 * outputs will be converted. Otherwise, all columns containing nominal
 * values will be converted.
 * <p>
 * Through a property of the module, a user can select whether the generated
 * columns are of type <code>boolean</code> or <code>int</code>.
 */
public class ScalarizeNominals extends DataPrepModule {

////////////////////////////////////////////////////////////////////////////////
// Module methods                                                             //
////////////////////////////////////////////////////////////////////////////////

   public String getInputInfo(int index) {
    if (index == 0)
      return "A <i>MutableTable</i> (possibly an <i>ExampleTable</i>).";
    return "NO SUCH INPUT";
   }

   public String getInputName(int index) {
    if (index == 0)
      return "Mutable Table";
    return "NO SUCH INPUT";
   }

   public String[] getInputTypes() {
    return new String[] {
      "ncsa.d2k.modules.core.datatype.table.MutableTable"
    };
   }

   public String getModuleInfo() {
    StringBuffer sb = new StringBuffer("<p>Overview: ");
    sb.append("This module examines columns in a <i>MutableTable</i> and, ");
    sb.append("for appropriate columns which contain nominal values, ");
    sb.append("converts these single columns into multiple columns -- one ");
    sb.append("for each possible value of the attribute.");
    sb.append("</p><p>Detailed Description: ");
    sb.append("If the input <i>MutableTable</i> implements the ");
    sb.append("<i>ExampleTable</i> interface, only columns marked as ");
    sb.append("inputs and outputs will be converted. Otherwise, all ");
    sb.append("columns containing nominal values will be converted. ");
    sb.append("Through a property of the module, the user can select ");
    sb.append("whether the generated columns are double or boolean.");
    sb.append("</p><p>Data Handling: ");
    sb.append("This module modifies its input data; each relevant nominal ");
    sb.append("column may be replaced with an arbitrary number of new ");
    sb.append("ones. In addition, columns with blank labels are assigned ");
    sb.append("default ones.");
    sb.append("</p>");
    return sb.toString();
   }

   public String getModuleName() {
    return "Scalarize Nominals";
   }

   public String getOutputInfo(int index) {
    if (index == 0) {
      StringBuffer sb = new StringBuffer();
      sb.append("The input <i>MutableTable</i> with appropriate nominal ");
      sb.append("columns transformed.");
      return sb.toString();
    }
    return "NO SUCH OUTPUT";
   }

   public String[] getOutputTypes() {
    return new String[] {
      "ncsa.d2k.modules.core.datatype.table.MutableTable"
    };
   }

   public String getOutputName(int index) {
    if (index == 0)
      return "Scalarized Mutable Table";
    return "NO SUCH OUTPUT";
   }

////////////////////////////////////////////////////////////////////////////////
// properties                                                                 //
////////////////////////////////////////////////////////////////////////////////

   private boolean _newTypeBoolean = true;
   public void setNewTypeBoolean(boolean value) { _newTypeBoolean = value; }
   public boolean getNewTypeBoolean() { return _newTypeBoolean; }

   public PropertyDescription[] getPropertiesDescriptions() {

    PropertyDescription newTypeBooleanDesc = new PropertyDescription(
      "newTypeBoolean",
      "Create new columns as type boolean",
      "Controls whether converted nominal columns will have scalar type " +
      "boolean (true) or type double (false).");

    return new PropertyDescription[] { newTypeBooleanDesc };

   }

////////////////////////////////////////////////////////////////////////////////
// doit()                                                                     //
////////////////////////////////////////////////////////////////////////////////

   public void doit() throws Exception {

    MutableTable table = (MutableTable)pullInput(0);

    int[] indices;
    int[] origInputs = null, origOutputs = null;

    // columns with blank labels need to be assigned default ones

    for (int i = 0; i < table.getNumColumns(); i++) {
      String s = table.getColumnLabel(i);
      if (s == null || s.length() == 0)
       table.setColumnLabel("column_" + i, i);
    }

    // determine which columns we wish to transform

    boolean tableIsExample = false;

    if (table instanceof ExampleTable) {

      tableIsExample = true;

      ExampleTable et = (ExampleTable)table;

      origInputs  = new int[et.getInputFeatures().length];
      for (int i = 0; i < origInputs.length; i++)
       origInputs[i] = et.getInputFeatures()[i];

      origOutputs = new int[et.getOutputFeatures().length];
      for (int i = 0; i < origOutputs.length; i++)
       origOutputs[i] = et.getOutputFeatures()[i];

      // ensure unique column indices

      HashMap uniqueIndexMap = new HashMap();

      for (int i = 0; i < origInputs.length; i++)
       if (et.isColumnNominal(origInputs[i]))
         uniqueIndexMap.put(new Integer(origInputs[i]), null);

      for (int i = 0; i < origOutputs.length; i++)
       if (et.isColumnNominal(origOutputs[i]))
         uniqueIndexMap.put(new Integer(origOutputs[i]), null);

      // retrieve column indices

      indices = new int[uniqueIndexMap.size()];
      int index = 0;

      Iterator iterator = uniqueIndexMap.keySet().iterator();
      while (iterator.hasNext())
       indices[index++] = ((Integer)iterator.next()).intValue();

      Arrays.sort(indices);

      // we'll be removing all input and output columns, so set these to
      // empty (at least until ExampleTableImpl handles this properly!)
      /*
      et.setInputFeatures(new int[0]);
      et.setOutputFeatures(new int[0]);
      */

    }
    else {

      // simply iterate to find nominal columns

      int numNominalColumns = 0;

      for (int i = 0; i < table.getNumColumns(); i++)
       if (table.isColumnNominal(i))
         numNominalColumns++;

      indices = new int[numNominalColumns];

      int index = 0;
      for (int i = 0; i < table.getNumColumns(); i++)
       if (table.isColumnNominal(i))
         indices[index++] = i;

    }

    // iterate and replace

    int offset = 0; // number of extra columns added to the table. must be
             // added to column indices in order to keep consistent

    int numRows = table.getNumRows();

    for (int count = 0; count < indices.length; count++) {

      int index = indices[count] + offset;

      // find this column's unique values

      HashMap uniqueValuesMap = new HashMap();
      int uniqueValueCount = 0;

      for (int row = 0; row < numRows; row++) {

       if (table.isValueMissing(row, index))
         continue;
       else if (table.isValueMissing(row, index))
         continue;

       String s = table.getString(row, index);

       if (s == null || s.length() == 0)
         continue;
       if (uniqueValuesMap.containsKey(s))
         continue;

       uniqueValuesMap.put(s, new Integer(uniqueValueCount++));

      }

      if (uniqueValuesMap.size() == 0) { // nothing (or only missing) here
       continue;
      }
      else {

       // first, we'd like our string-to-integer mappings as arrays,
       // for efficiency

       String[] uniqueValues = new String[uniqueValuesMap.size()];
       int[] uniqueValueIndices = new int[uniqueValues.length];

       Iterator iterator = uniqueValuesMap.keySet().iterator();
       int iteratorCount = 0;

       while (iterator.hasNext())
         uniqueValues[iteratorCount++] = (String)iterator.next();

       for (int i = 0; i < uniqueValues.length; i++)
         uniqueValueIndices[i] = ((Integer)uniqueValuesMap.get(
           uniqueValues[i])).intValue();

       // we also want an indirection array so we can act as if these
       // mappings were sorted on the integer value

       int[] indirection = new int[uniqueValueIndices.length];
       for (int i = 0; i < uniqueValueIndices.length; i++)
         indirection[uniqueValueIndices[i]] = i;

       // now create one array for the entire column specifying which
       // unique value is contained in each row. if the value is missing
       // or empty, set to -1.

       int[] match = new int[numRows];
       for (int row = 0; row < numRows; row++) {

         if (table.isValueMissing(row, index) ||
            table.isValueEmpty  (row, index)) {
           match[row] = -1;
           continue;
         }

         String s = table.getString(row, index);

         for (int j = 0; j < uniqueValues.length; j++) {
           if (s.equals(uniqueValues[indirection[j]])) {
            match[row] = j;
            break;
           }
         }

       }

       // !:
       // are we dealing with an ExampleTable? if so, is the old column
       // an input, output, or both?

       boolean isInput = false, isOutput = false;
       if (tableIsExample) {

         ExampleTable et = (ExampleTable)table;

         for (int i = 0; i < origInputs.length; i++) {
           if (origInputs[i] == indices[count]) {
            isInput = true;
            break;
           }
         }

         for (int i = 0; i < origOutputs.length; i++) {
           if (origOutputs[i] == indices[count]) {
            isOutput = true;
            break;
           }
         }

       }

       // remove the old column

       String columnLabel = table.getColumnLabel(index);
       table.removeColumn(index);
       offset--;

       // iterate and create the new columns

       for (int k = 0; k < uniqueValues.length; k++) {

         if (_newTypeBoolean) { // create new columns as type boolean

           boolean[] newColumn = new boolean[numRows];

           for (int row = 0; row < match.length; row++) {
            if (match[row] == k)
              newColumn[row] = true;
            else
              newColumn[row] = false;
           }

           table.insertColumn(newColumn, index + k);
           table.setColumnLabel(columnLabel + "=" +
            uniqueValues[indirection[k]], index + k);

         }
         else { // create new columns as type int

           double [] newColumn = new double[numRows];

           for (int row = 0; row < match.length; row++) {
            if (match[row] == k)
              newColumn[row] = 1;
            else
              newColumn[row] = 0;
           }

           table.insertColumn(newColumn, index + k);
           table.setColumnLabel(columnLabel + "=" +
            uniqueValues[indirection[k]], index + k);

         }

         offset++;

         // !:
         // we now must add this new column to the list of inputs/outputs
         // if we are dealing with an ExampleTable. this isn't very
         // efficient; maybe we should modify the API to handle this
         if (tableIsExample) {

           ExampleTable et = (ExampleTable)table;

           if (isInput) {

            int[] inputs = et.getInputFeatures();
            int[] newInputs = new int[inputs.length + 1];

            for (int i = 0; i < inputs.length; i++)
              newInputs[i] = inputs[i];
            newInputs[inputs.length] = index + k;

            Arrays.sort(newInputs);

            et.setInputFeatures(newInputs);

           }

           if (isOutput) {

            int[] outputs = et.getOutputFeatures();
            int[] newOutputs = new int[outputs.length + 1];

            for (int i = 0; i < outputs.length; i++)
              newOutputs[i] = outputs[i];
            newOutputs[outputs.length] = index + k;

            Arrays.sort(newOutputs);

            et.setOutputFeatures(newOutputs);

           }

         }

       }

      }

    }

    pushOutput(table, 0);

   }

   /*
   private void printInputOutputColumns(ExampleTable et) {

     int[] inputColumns = et.getInputFeatures(),
         outputColumns = et.getOutputFeatures();

     String[] inputNames = et.getInputNames(),
            outputNames = et.getOutputNames();

     System.out.print("actual labels:");
     for (int i = 0; i < et.getNumColumns(); i++)
       System.out.print(" " + i + "(" + et.getColumnLabel(i) + ")");
     System.out.println(" [" + et.getNumColumns() + "]");

     System.out.print("input columns:");
     for (int i = 0; i < inputColumns.length; i++)
       System.out.print(" " + inputColumns[i] + "(" + inputNames[i] + ")");
     System.out.println(" [" + inputColumns.length + "]");

     System.out.print("output columns:");
     for (int i = 0; i < outputColumns.length; i++)
       System.out.print(" " + outputColumns[i] + "(" + outputNames[i] + ")");
     System.out.println(" [" + outputColumns.length + "]");

   }
   */

}