Commit 2b8f4f60 authored by Jason Shepherd's avatar Jason Shepherd
Browse files

ENH: Adding the initial version of a TextAnalysis kit to VTK. Requires Boost...

ENH: Adding the initial version of a TextAnalysis kit to VTK.  Requires Boost and NWay Arrays to be on in CMake, as well as VTK_USE_TEXT_ANALYSIS.  Credit goes to Tim Shead and many of the other SNL developers.
parent 34650085
......@@ -266,6 +266,7 @@ MARK_AS_ADVANCED(VTK_USE_METAIO)
# Determine the set of VTK kits that should be built.
# Each switch below is a user-visible cache option; the third argument is
# its default value.
option(VTK_USE_RENDERING "Build the vtkRendering kit. Needed for displaying data or using widgets." ON)
option(VTK_USE_INFOVIS "Build the vtkInfovis kit. Needed for performing information visualization." ON)
# The text analysis kit is opt-in (defaults to OFF).
option(VTK_USE_TEXT_ANALYSIS "Build the vtkTextAnalysis kit. Needed for performing text analysis." OFF)
vtk_dependent_option(VTK_USE_PARALLEL "Build the vtkParallel kit." OFF "" OFF)
VTK_DEPENDENT_OPTION(VTK_USE_VIEWS "Build the vtkViews kit. Needed for creating packaged and linked views." ON
......@@ -363,6 +364,10 @@ IF(VTK_USE_INFOVIS)
SET(VTK_KITS ${VTK_KITS} INFOVIS)
ENDIF(VTK_USE_INFOVIS)
# Register the optional kits in VTK_KITS, one entry per enabled kit.
if(VTK_USE_TEXT_ANALYSIS)
  set(VTK_KITS ${VTK_KITS} TEXT_ANALYSIS)
endif(VTK_USE_TEXT_ANALYSIS)
if(VTK_USE_GEOVIS)
  set(VTK_KITS ${VTK_KITS} GEOVIS)
endif(VTK_USE_GEOVIS)
......@@ -1148,6 +1153,9 @@ ENDIF(VTK_USE_PARALLEL)
# Descend into the source directory of every enabled optional kit.
if(VTK_USE_INFOVIS)
  add_subdirectory(Infovis)
endif(VTK_USE_INFOVIS)
if(VTK_USE_TEXT_ANALYSIS)
  add_subdirectory(TextAnalysis)
endif(VTK_USE_TEXT_ANALYSIS)
if(VTK_USE_GEOVIS)
  add_subdirectory(Geovis)
endif(VTK_USE_GEOVIS)
......@@ -1192,6 +1200,9 @@ IF(BUILD_TESTING)
# Descend into the Testing directory of every enabled optional kit.
if(VTK_USE_INFOVIS)
  add_subdirectory(Infovis/Testing)
endif(VTK_USE_INFOVIS)
if(VTK_USE_TEXT_ANALYSIS)
  add_subdirectory(TextAnalysis/Testing)
endif(VTK_USE_TEXT_ANALYSIS)
if(VTK_USE_GEOVIS)
  add_subdirectory(Geovis/Testing)
endif(VTK_USE_GEOVIS)
......@@ -1234,6 +1245,9 @@ IF(NOT VTK_INSTALL_NO_DEVELOPMENT)
# Collect the generated <Kit>Instantiator.h header of every enabled optional
# kit into __inst_files.
if(VTK_USE_INFOVIS)
  set(__inst_files ${__inst_files} ${VTK_BINARY_DIR}/vtkInfovisInstantiator.h)
endif(VTK_USE_INFOVIS)
if(VTK_USE_TEXT_ANALYSIS)
  set(__inst_files ${__inst_files} ${VTK_BINARY_DIR}/vtkTextAnalysisInstantiator.h)
endif(VTK_USE_TEXT_ANALYSIS)
if(VTK_USE_GEOVIS)
  set(__inst_files ${__inst_files} ${VTK_BINARY_DIR}/vtkGeovisInstantiator.h)
endif(VTK_USE_GEOVIS)
......
......@@ -211,6 +211,12 @@ Do_not_include_vtkWin32Header_directly__vtkSystemIncludes_includes_it;
#define VTK_RENDERING_EXPORT __declspec( dllimport )
#endif
/* Symbol decoration for the vtkTextAnalysis library: export the symbols
   when vtkTextAnalysis_EXPORTS is defined, import them otherwise. */
#ifdef vtkTextAnalysis_EXPORTS
# define VTK_TEXT_ANALYSIS_EXPORT __declspec( dllexport )
#else
# define VTK_TEXT_ANALYSIS_EXPORT __declspec( dllimport )
#endif
#if defined(vtkVolumeRendering_EXPORTS)
#define VTK_VOLUMERENDERING_EXPORT __declspec( dllexport )
#else
......@@ -252,6 +258,7 @@ Do_not_include_vtkWin32Header_directly__vtkSystemIncludes_includes_it;
/* Per-kit export macros defined to nothing, so class declarations carry no
   __declspec decoration in this configuration.
   NOTE(review): the enclosing preprocessor condition is outside this view;
   presumably the branch for builds without DLL import/export - confirm. */
#define VTK_INFOVIS_EXPORT
#define VTK_IO_EXPORT
#define VTK_RENDERING_EXPORT
#define VTK_TEXT_ANALYSIS_EXPORT
#define VTK_VOLUMERENDERING_EXPORT
#define VTK_HYBRID_EXPORT
#define VTK_WIDGETS_EXPORT
......
# The text analysis kit cannot be configured without N-way arrays and Boost.
# SEND_ERROR reports the problem and marks the configuration as failed while
# still letting CMake process the rest of the file, so both problems are
# reported in a single run.
if(NOT VTK_USE_N_WAY_ARRAYS)
  message(SEND_ERROR "VTK_USE_TEXT_ANALYSIS requires VTK_USE_N_WAY_ARRAYS")
endif(NOT VTK_USE_N_WAY_ARRAYS)

if(NOT VTK_USE_BOOST)
  message(SEND_ERROR "VTK_USE_TEXT_ANALYSIS requires VTK_USE_BOOST")
endif(NOT VTK_USE_BOOST)
# Kit identity: KIT is the mixed-case kit name, UKIT the upper-case form.
set(KIT TextAnalysis)
set(UKIT TEXT_ANALYSIS)

# Libraries the kit depends on, listed once per language binding and once
# for the C++ library itself (KIT_LIBS).
set(KIT_TCL_LIBS vtkIOTCL vtkFilteringTCL)
set(KIT_PYTHON_LIBS vtkIOPythonD vtkFilteringPythonD)
set(KIT_JAVA_LIBS vtkIOJava vtkFilteringJava)
set(KIT_CS_LIBS vtkIOCS vtkFilteringCS)
set(KIT_LIBS vtkIO vtkFiltering)
# C++ sources that make up the kit, in alphabetical order.
set(Kit_SRCS
  vtkDocumentReader.cxx
  vtkDocumentTextExtraction.cxx
  vtkFileExtensionMimeTypeStrategy.cxx
  vtkFoldCase.cxx
  vtkMimeTypes.cxx
  vtkMimeTypeStrategy.cxx
  vtkNGramExtraction.cxx
  vtkTermDictionary.cxx
  vtkTextAnalysisUtility.cxx
  vtkTokenizer.cxx
  vtkTokenLengthFilter.cxx
  vtkTokenValueFilter.cxx
  )

# vtkMimeTypeStrategy is flagged ABSTRACT.
set_source_files_properties(vtkMimeTypeStrategy.cxx ABSTRACT)

# vtkTextAnalysisUtility is flagged WRAP_EXCLUDE (kept out of the language
# wrappers).
set_source_files_properties(vtkTextAnalysisUtility.cxx WRAP_EXCLUDE)
# This kit has no extra sources, commands or wrapper dependencies: each of
# these variables is explicitly cleared before the shared kit code runs.
set(Kit_EXTRA_SRCS)
set(Kit_EXTRA_CMDS)
set(Kit_TCL_EXTRA_SRCS)
set(Kit_PYTHON_EXTRA_SRCS)
set(Kit_JAVA_EXTRA_SRCS)
set(KIT_TCL_DEPS)
set(KIT_PYTHON_DEPS)
set(KIT_JAVA_DEPS)

#-----------------------------------------------------------------------------
# Include CMake code common to all kits.
include(${VTK_CMAKE_DIR}/KitCommonBlock.cmake)
#-----------------------------------------------------------------------------
# C++ tests are always configured; Tcl tests only when Tcl wrapping is on.
subdirs(Cxx)
if(VTK_WRAP_TCL)
  subdirs(Tcl)
endif(VTK_WRAP_TCL)
# Python tests are currently disabled:
#IF (VTK_WRAP_PYTHON)
# SUBDIRS(Python)
#ENDIF (VTK_WRAP_PYTHON)
# Run the shared HeaderTesting.py script over the kit's headers when a
# Python interpreter is available. The arguments are the kit directory, the
# kit's export macro, and two header names handed to the script
# (NOTE(review): presumably headers exempted from the check - confirm
# against HeaderTesting.py).
if(PYTHON_EXECUTABLE)
  add_test(HeaderTesting-TextAnalysis
    ${PYTHON_EXECUTABLE}
    ${VTK_SOURCE_DIR}/Common/Testing/HeaderTesting.py
    "${VTK_SOURCE_DIR}/TextAnalysis"
    VTK_TEXT_ANALYSIS_EXPORT
    vtkDocumentReader.h
    vtkTextAnalysisUtility.h
    )
endif(PYTHON_EXECUTABLE)
# Source-checking tests driven by Tcl scripts. They are registered only when
# a tclsh interpreter was found: with TCL_TCLSH unset or *-NOTFOUND the test
# command line would otherwise be invalid and the tests could only fail.
# This mirrors the PYTHON_EXECUTABLE guard used for the header test.
if(TCL_TCLSH)
  # Runs PrintSelfCheck.tcl over the kit's source directory.
  add_test(PrintSelf-TextAnalysis ${TCL_TCLSH}
    ${VTK_SOURCE_DIR}/Common/Testing/Tcl/PrintSelfCheck.tcl
    ${VTK_SOURCE_DIR}/TextAnalysis)

  # Runs FindString.tcl over the kit's headers, looking for
  # "vtkSetObjectMacro".
  add_test(TestSetObjectMacro-TextAnalysis ${TCL_TCLSH}
    ${VTK_SOURCE_DIR}/Common/Testing/Tcl/FindString.tcl
    "${VTK_SOURCE_DIR}/TextAnalysis/vtk\\\\*.h"
    "vtkSetObjectMacro"
    )
endif(TCL_TCLSH)
/*=========================================================================
Program: Visualization Toolkit
Module: vtkDocumentReader.cxx
Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
All rights reserved.
See Copyright.txt or http://www.kitware.com/Copyright.htm for details.
This software is distributed WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the above copyright notice for more information.
=========================================================================*/
/*----------------------------------------------------------------------------
Copyright (c) Sandia Corporation
See Copyright.txt or http://www.paraview.org/HTML/Copyright.html for details.
----------------------------------------------------------------------------*/
#include <vtkArrayData.h>
#include <vtkCommand.h>
#include <vtkDataSetAttributes.h>
#include <vtkDenseArray.h>
#include <vtkDocumentReader.h>
#include <vtkIdTypeArray.h>
#include <vtkInformation.h>
#include <vtkInformationVector.h>
#include <vtkMimeTypes.h>
#include <vtkObjectFactory.h>
#include <vtkSmartPointer.h>
#include <vtkStringArray.h>
#include <vtkTable.h>
#include <vtkstd/algorithm>
#include <vtkstd/iterator>
#include <vtkstd/stdexcept>
#include <vtkstd/string>
#include <vtkstd/vector>
#include <boost/algorithm/string/replace.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <vtksys/ios/sstream>
// Private state of vtkDocumentReader: the registered file paths and, in
// parallel, the document id assigned to each of them.
class vtkDocumentReader::Internals
{
public:
  // Builds a "file://" URI from a filesystem path: surrounding whitespace is
  // stripped, backslashes become forward slashes, and a path whose second
  // character is ':' (a drive letter) gets a leading '/'.
  static const vtkStdString PathToURI(const vtkStdString& path)
  {
    vtkStdString uri(path);

    boost::trim(uri);
    boost::replace_all(uri, "\\", "/");

    const bool has_drive_letter = uri.size() > 1 && uri[1] == ':';
    if(has_drive_letter)
      {
      uri.insert(0, "/");
      }

    uri.insert(0, "file://");
    return uri;
  }

  // Paths of the files to load, in the order they were added.
  vtkstd::vector<vtkStdString> Files;
  // Document id of the file stored at the same index in Files.
  vtkstd::vector<vtkIdType> ID;
};
vtkCxxRevisionMacro(vtkDocumentReader, "1.1");
vtkStandardNewMacro(vtkDocumentReader);
// Creates a reader with an empty file list and no default MIME type.
// The reader is a pure source (no input ports) with two output ports; the
// data type of each port is declared in FillOutputPortInformation().
vtkDocumentReader::vtkDocumentReader() :
Implementation(new Internals()),
DefaultMimeType(0)
{
this->SetNumberOfInputPorts(0);
this->SetNumberOfOutputPorts(2);
}
// Releases the default MIME type string and the private implementation.
vtkDocumentReader::~vtkDocumentReader()
{
// Resetting through the macro-generated setter releases the stored string.
this->SetDefaultMimeType(0);
delete this->Implementation;
}
// Prints the superclass state, the default MIME type ("(none)" when unset)
// and one "File:" line per registered file.
void vtkDocumentReader::PrintSelf(ostream& os, vtkIndent indent)
{
  this->Superclass::PrintSelf(os, indent);

  const char* const mime_type = this->DefaultMimeType ? this->DefaultMimeType : "(none)";
  os << indent << "DefaultMimeType: " << mime_type << "\n";

  const vtkstd::vector<vtkStdString>& files = this->Implementation->Files;
  for(vtkstd::vector<vtkStdString>::const_iterator file = files.begin(); file != files.end(); ++file)
    {
    os << indent << "File: " << *file << "\n";
    }
}
void vtkDocumentReader::AddFile(const char* file)
{
if(!file)
return;
this->AddFile(vtkStdString(file));
}
void vtkDocumentReader::AddFile(const vtkStdString& file)
{
this->AddFile(file, this->Implementation->Files.size());
}
void vtkDocumentReader::AddFile(const vtkStdString& file, const vtkIdType id)
{
this->Implementation->Files.push_back(file);
this->Implementation->ID.push_back(id);
this->Modified();
}
// Forgets every registered file together with its document id and marks the
// reader as modified.
void vtkDocumentReader::ClearFiles()
{
this->Implementation->Files.clear();
this->Implementation->ID.clear();
this->Modified();
}
// Declares the data type produced on each output port: a vtkTable on port 0
// and a vtkArrayData on port 1. Returns 1 on success and 0 for any other
// port number.
int vtkDocumentReader::FillOutputPortInformation(int port, vtkInformation* information)
{
  const char* data_type = 0;
  if(port == 0)
    {
    data_type = "vtkTable";
    }
  else if(port == 1)
    {
    data_type = "vtkArrayData";
    }

  if(!data_type)
    {
    return 0;
    }

  information->Set(vtkDataObject::DATA_TYPE_NAME(), data_type);
  return 1;
}
// Loads every registered file and fills both outputs.
//
// Output port 0 receives a table with one row per file and the columns
// "document" (the file's document id, also installed as pedigree ids),
// "uri", "mime_type" and "content". Output port 1 receives a one-element
// dense array holding the number of files.
//
// A file that cannot be opened produces a warning and a row with empty
// content. Returns 1 on success, 0 if an exception was caught.
int vtkDocumentReader::RequestData(
  vtkInformation* vtkNotUsed(request),
  vtkInformationVector** vtkNotUsed(inputVector),
  vtkInformationVector* outputVector)
{
  try
    {
    vtkSmartPointer<vtkMimeTypes> mime_types = vtkSmartPointer<vtkMimeTypes>::New();

    // The arrays are held by smart pointers so that they are released even
    // when an exception (e.g. an allocation failure while reading a large
    // file) unwinds out of this block.
    vtkSmartPointer<vtkIdTypeArray> document_array = vtkSmartPointer<vtkIdTypeArray>::New();
    document_array->SetName("document");

    vtkSmartPointer<vtkStringArray> uri_array = vtkSmartPointer<vtkStringArray>::New();
    uri_array->SetName("uri");

    vtkSmartPointer<vtkStringArray> mime_type_array = vtkSmartPointer<vtkStringArray>::New();
    mime_type_array->SetName("mime_type");

    vtkSmartPointer<vtkStringArray> content_array = vtkSmartPointer<vtkStringArray>::New();
    content_array->SetName("content");

    const vtkIdType number_of_files = static_cast<vtkIdType>(this->Implementation->Files.size());
    for(vtkIdType i = 0; i != number_of_files; ++i)
      {
      const vtkStdString& file = this->Implementation->Files[i];
      const vtkIdType document = this->Implementation->ID[i];
      const vtkStdString uri = Internals::PathToURI(file);

      // Fall back to the user-supplied default when the type is unknown.
      vtkStdString mime_type = mime_types->Lookup(file);
      if(mime_type.empty() && this->DefaultMimeType)
        {
        mime_type = this->DefaultMimeType;
        }

      // Documents may hold binary data, so read the bytes untranslated:
      // text mode would rewrite line endings and stop at Ctrl-Z on Windows.
      ifstream file_stream(file.c_str(), ios::in | ios::binary);
      if(!file_stream)
        {
        vtkWarningMacro(<< "unable to open file: " << file);
        }
      vtkstd::stringstream contents;
      contents << file_stream.rdbuf();

      document_array->InsertNextValue(document);
      uri_array->InsertNextValue(uri);
      mime_type_array->InsertNextValue(mime_type);
      content_array->InsertNextValue(contents.str());

      //emit event progress...
      double progress = static_cast<double>(i) / static_cast<double>(number_of_files);
      this->InvokeEvent(vtkCommand::ProgressEvent, &progress);
      }

    vtkTable* const output_table = vtkTable::GetData(outputVector, 0);
    output_table->AddColumn(document_array.GetPointer());
    output_table->AddColumn(uri_array.GetPointer());
    output_table->AddColumn(mime_type_array.GetPointer());
    output_table->AddColumn(content_array.GetPointer());
    output_table->GetRowData()->SetPedigreeIds(document_array.GetPointer());

    vtkSmartPointer<vtkDenseArray<vtkIdType> > output_document_count_array =
      vtkSmartPointer<vtkDenseArray<vtkIdType> >::New();
    output_document_count_array->Resize(1);
    output_document_count_array->SetValue(0, this->Implementation->Files.size());

    vtkArrayData* const output_document_count = vtkArrayData::GetData(outputVector, 1);
    output_document_count->ClearArrays();
    output_document_count->AddArray(output_document_count_array.GetPointer());
    }
  catch(vtkstd::exception& e)
    {
    vtkErrorMacro(<< "unhandled exception: " << e.what());
    return 0;
    }
  catch(...)
    {
    vtkErrorMacro(<< "unknown exception");
    return 0;
    }

  return 1;
}
/*=========================================================================
Program: Visualization Toolkit
Module: vtkDocumentReader.h
-------------------------------------------------------------------------
Copyright 2008 Sandia Corporation.
Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
the U.S. Government retains certain rights in this software.
-------------------------------------------------------------------------
Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
All rights reserved.
See Copyright.txt or http://www.kitware.com/Copyright.htm for details.
This software is distributed WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the above copyright notice for more information.
=========================================================================*/
#ifndef __vtkDocumentReader_h
#define __vtkDocumentReader_h
#include <vtkTableAlgorithm.h>
// .NAME vtkDocumentReader - Reads documents into memory for text analysis.
//
// .SECTION Description
// Reads zero-to-many documents into memory, producing a vtkTable suitable
// for use as an input to other VTK text analysis filters.
//
// Parameters:
// "Files": a collection of filesystem paths to be loaded.
//
// Outputs:
// Output port 0: A vtkTable containing "document", "uri", "mime_type",
// and "content" columns.
// Output port 1: A 1D vtkDenseArray<vtkIdType> storing the number of documents
// as a single value.
//
// The output "document" column will contain a zero-based integer document index.
//
// .SECTION Caveats
// As a workaround, vtkDocumentReader stores the contents of each document
// in the "content" column, which is a vtkStringArray. Note that the
// contents of a document may actually be binary data, so check the MIME-Type
// before treating the contents as a string.
//
// .SECTION Thanks
// Developed by Timothy M. Shead (tshead@sandia.gov) at Sandia National Laboratories.
class vtkStdString;
class VTK_TEXT_ANALYSIS_EXPORT vtkDocumentReader :
public vtkTableAlgorithm
{
public:
static vtkDocumentReader* New();
vtkTypeRevisionMacro(vtkDocumentReader, vtkTableAlgorithm);
void PrintSelf(ostream& os, vtkIndent indent);
// Description:
// Add a file to be loaded. A NULL file is ignored. A file added through
// these methods is assigned a document id equal to the number of files
// registered before it.
void AddFile(const char* file);
void AddFile(const vtkStdString& file);
// Description:
// Clear the list of files to be loaded.
void ClearFiles();
// Description:
// Specifies a default MIME type that will be assigned to files whose MIME type
// can't otherwise be identified. Set this to "text/plain" if you want to analyze
// files that would otherwise be ignored (such as files without a known file
// extension, files without any file extension, etc). Default: NULL (no
// default MIME type), in which case the MIME type of such files is left empty.
vtkSetStringMacro(DefaultMimeType);
vtkGetStringMacro(DefaultMimeType);
//BTX
protected:
vtkDocumentReader();
~vtkDocumentReader();
// Declares the data type produced on each output port.
virtual int FillOutputPortInformation(int, vtkInformation*);
// Reads the registered files and fills the reader's outputs.
virtual int RequestData(
vtkInformation* request,
vtkInformationVector** inputVector,
vtkInformationVector* outputVector);
private:
vtkDocumentReader(const vtkDocumentReader &); // Not implemented.
void operator=(const vtkDocumentReader &); // Not implemented.
// vtkPDocumentReader is granted access to the private members below.
friend class vtkPDocumentReader;
// Adds a file to be loaded under an explicitly chosen document id.
void AddFile(const vtkStdString& file, const vtkIdType id);
// Private implementation holding the file list and the document ids.
class Internals;
Internals* const Implementation;
// Default MIME type string, or NULL when none has been set.
char* DefaultMimeType;
//ETX
};
#endif // __vtkDocumentReader_h
/*=========================================================================
Program: Visualization Toolkit
Module: vtkDocumentTextExtraction.cxx
Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
All rights reserved.
See Copyright.txt or http://www.kitware.com/Copyright.htm for details.
This software is distributed WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the above copyright notice for more information.
=========================================================================*/
/*----------------------------------------------------------------------------
Copyright (c) Sandia Corporation
See Copyright.txt or http://www.paraview.org/HTML/Copyright.html for details.
----------------------------------------------------------------------------*/
#include <vtkCommand.h>
#include <vtkDocumentTextExtraction.h>
#include <vtkIdTypeArray.h>
#include <vtkInformation.h>
#include <vtkObjectFactory.h>
#include <vtkSmartPointer.h>
#include <vtkStringArray.h>
#include <vtkTable.h>
#include <vtkUnicodeStringArray.h>
#include <stdexcept>
vtkCxxRevisionMacro(vtkDocumentTextExtraction, "1.1");
vtkStandardNewMacro(vtkDocumentTextExtraction);

// Configures the filter's default input arrays: array 0 is the "mime_type"
// row-data column and array 1 the "content" row-data column, both taken
// from input port 0, connection 0. The filter has a single input port.
vtkDocumentTextExtraction::vtkDocumentTextExtraction()
{
  // FIELD_ASSOCIATION_ROWS names the association previously passed as the
  // bare literal 6.
  this->SetInputArrayToProcess(0, 0, 0, vtkDataObject::FIELD_ASSOCIATION_ROWS, "mime_type");
  this->SetInputArrayToProcess(1, 0, 0, vtkDataObject::FIELD_ASSOCIATION_ROWS, "content");
  this->SetNumberOfInputPorts(1);
}
// Nothing to release: the destructor body is intentionally empty.
vtkDocumentTextExtraction::~vtkDocumentTextExtraction()
{
}
// Prints the superclass state only; this class prints nothing of its own.
void vtkDocumentTextExtraction::PrintSelf(ostream& os, vtkIndent indent)
{
this->Superclass::PrintSelf(os, indent);
}
int vtkDocumentTextExtraction::RequestData(
vtkInformation* vtkNotUsed(request),
vtkInformationVector** inputVector,
vtkInformationVector* outputVector)
{
try
{
vtkTable* const input_table = vtkTable::GetData(inputVector[0]);
if(!input_table)
throw vtkstd::runtime_error("missing input table");
vtkStringArray* const mime_type_array = vtkStringArray::SafeDownCast(
this->GetInputAbstractArrayToProcess(0, 0, inputVector));
if(!mime_type_array)
throw vtkstd::runtime_error("missing mime_type array");
vtkStringArray* const content_array = vtkStringArray::SafeDownCast(
this->GetInputAbstractArrayToProcess(1, 0, inputVector));
if(!content_array)
throw vtkstd::runtime_error("missing content array");
vtkUnicodeStringArray* const text_array = vtkUnicodeStringArray::New();
text_array->SetName("text");
int count = mime_type_array->GetNumberOfTuples();
for(vtkIdType i = 0; i != mime_type_array->GetNumberOfTuples(); ++i)
{
const vtkStdString& mime_type = mime_type_array->GetValue(i);
const vtkStdString& content = content_array->GetValue(i);
// If it's a text document, just copy the data ...
if(0 == mime_type.find("text/"))
{
text_array->InsertNextUTF8Value(content.c_str());
}
// Can't identify the file type, so assume there's no text in it ...
else
{
text_array->InsertNextValue(vtkUnicodeString());
}
if( i % 100 == 0 )
{
//emit progress...
double progress = static_cast<double>(i) / static_cast<double>(count);
this->InvokeEvent(vtkCommand::ProgressEvent, &progress);
}
}
vtkTable* const output_table = vtkTable::GetData(outputVector);
output_table->ShallowCopy(input_table);
output_table->AddColumn(text_array);
text_array->Delete();
}
catch(vtkstd::exception& e)
{
vtkErrorMacro(<< "unhandled exception: " << e.what());
return 0;
}
catch(...)