Commit 1df5d6a2 authored by David Thompson
Browse files

Methods to compute and return unique array values.

Determining unique values is the "abstract" (or categorical)
equivalent of determining the range of an interval-valued array.
It only makes sense for arrays with a small number of distinct
values, and this also happens to be when it is computationally
efficient to compute.

This adds methods `GetUniqueComponentValues` and
`UpdateDiscreteValueSet` to `vtkAbstractArray`.
The `GetUniqueComponentValues` method populates a `vtkVariantArray`
you pass with a list of unique values taken on by a component, or
clears it to an empty array if there are more than
`vtkAbstractArray::MAX_DISCRETE_VALUES` distinct values present.

The unique values are identified by sampling roughly log2(n) tuples
(but never fewer than 2 * MAX_DISCRETE_VALUES) for large arrays, so
obtaining the summary information --
which is cached once computed -- is an inexpensive operation.
This does mean that rare observations will go undetected.

You may also obtain a list of unique *tuples* instead of unique
component values by passing -1 for the component number.

Unique values are cached in the `vtkInformation` associated with the array.
This commit also adds 2 new key types: one for storing `vtkVariant`
objects as values, and another for storing vectors of variants.
A variant vector is used to store unique tuples and unique component values.

Change-Id: I241cf61372966bba381eb28d14bdc043a54ba197
parent 091f0a6f
......@@ -92,6 +92,8 @@ SET(Module_SRCS
vtkInformationStringKey.cxx
vtkInformationStringVectorKey.cxx
vtkInformationUnsignedLongKey.cxx
vtkInformationVariantKey
vtkInformationVariantVectorKey
vtkInformationVector.cxx
vtkInstantiator.cxx
vtkIntArray.cxx
......@@ -395,6 +397,8 @@ set_source_files_properties(
vtkInformationStringKey
vtkInformationStringVectorKey
vtkInformationUnsignedLongKey
vtkInformationVariantKey
vtkInformationVariantVectorKey
vtkObjectBase
vtkObjectFactory
vtkOldStyleCallbackCommand
......
......@@ -10,6 +10,7 @@ create_test_sourcelist(Tests ${vtk-module}CxxTests.cxx
TestArrayLookup.cxx
TestArrayNullValues.cxx
TestArraySize.cxx
TestArrayUniqueValueDetection.cxx
TestArrayUserTypes.cxx
TestArrayVariants.cxx
TestCollection.cxx
......
#include "vtkAbstractArray.h"
#include "vtkDoubleArray.h"
#include "vtkIntArray.h"
#include "vtkNew.h"
#include "vtkStringArray.h"
#include "vtkVariantArray.h"
// Fixture for the 3-component double array: six tuples, one per row.
// Column 0 takes the values {0, 1}, column 1 takes {0, 2}, column 2 takes
// {1, 3, 7, 8, 9}, and all six tuples are distinct.
static double testTuplesDouble[] = {
// A simple test where every component *and* the tuples as a whole behave discretely.
0., 0., 1.,
1., 0., 3.,
0., 2., 7.,
0., 0., 9.,
1., 0., 1.,
0., 2., 8.,
};
// Bounds on the number of unique values that should be identified.
// For larger data, the number may not be precise as we sample subsets of the data.
// Layout, as indexed by CheckUniques: one {min, max} pair per component,
// followed by one {min, max} pair for the tuples taken as a whole.
static int numUniqueDouble[] = {
2, 2, // component 0
2, 2, // component 1
5, 5, // component 2
6, 6, // whole tuples
};
// Fixture for the 2-component int array: 48 tuples, one per row.
static int testTuplesInt[] = {
/*
These tuples reuse a small set of coordinate values but are frequently
not identical in both components to other tuples. Component 0 takes on
16 distinct values (1 through 16). Component 1 takes on only 15 distinct
values because the value 15 never appears in it, so at most 15 values
can be reported for component 1.
The tuples as a whole should not be reported as discrete since there
are 44 distinct tuples among the 48 rows, which is more than the
maximum number of discrete values allowed.
*/
16, 1,
8, 14,
10, 3,
11, 4,
2, 13,
7, 12,
6, 5,
15, 9,
15, 6,
9, 7,
11, 16,
1, 5,
2, 3,
13, 12,
4, 8,
14, 10,
4, 14,
11, 9,
7, 3,
8, 2,
12, 13,
1, 6,
15, 10,
16, 5,
4, 10,
12, 3,
5, 8,
13, 1,
14, 11,
2, 6,
15, 9,
7, 16,
1, 2,
5, 3,
16, 13,
15, 9,
11, 12,
7, 14,
8, 10,
4, 6,
8, 13,
16, 14,
15, 2,
11, 1,
3, 10,
4, 6,
7, 12,
5, 9,
};
// Expected {min, max} unique-value counts for testTuplesInt, in the order
// read by CheckUniques: one pair per component, then one pair for whole tuples.
static int numUniqueInt[] = {
1, 16, // component 0
1, 16, // component 1
0, 0, // whole tuples: an empty result is expected
};
// Fixture for the 1-component int array: the 33 values 1 through 33.
static int testTuplesInt2[] = {
// There are no repeats in this set.
1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33,
};
// Expected {min, max} unique-value count for testTuplesInt2. The array has a
// single component, so CheckUniques reads only this one pair.
static int numUniqueInt2[] = {
0, 0, // component 0: an empty result is expected
};
// Fixture for the 1-component string array: 46 rows of 5 strings each.
static vtkStdString testTuplesString[] = {
/*
To test the log(N) sampling strategy, we must
have more than 64 entries in the array.
In practice, we need even more as the "cache line"
optimization searches blocks of 8 strings at a time.
This array has 230 values and we should test
max(ceil(log2(230)),min(230,2*32)) = 64 tuples.
*/
// The first row holds the only occurrences of four rare words. They are
// reported only if a sampled block happens to cover them, which is why the
// expected count for this array is a range rather than a single number.
"Skeenie", "Beeny", "Piny", "Po", "Po", // rare; may go undetected.
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
"Eeny", "Meeny", "Miny", "Mo", "Miny",
};
// Expected {min, max} unique-value count for testTuplesString (one component).
// The lower bound covers the four words repeated throughout the array (Eeny,
// Meeny, Miny, Mo); the upper bound additionally allows the four distinct
// words that appear only in the first row (Skeenie, Beeny, Piny, Po).
static int numUniqueString[2] = {
4, 8, // component 0
};
// Query the unique values of every component of `arr` (and, for arrays with
// more than one component, of its tuples as a whole), print what was found,
// and verify that each count lies within the expected bounds.
//
// `uniqueSizeBds` holds one {min, max} pair per component, followed by one
// {min, max} pair for whole tuples when the array has more than one component.
//
// Returns true when every count is within bounds (or when `arr` is NULL).
static bool CheckUniques(vtkAbstractArray* arr, int* uniqueSizeBds)
{
  if (!arr)
  {
    return true;
  }
  cout << arr->GetName() << endl;

  bool passed = true;
  const int numComps = arr->GetNumberOfComponents();
  vtkNew<vtkVariantArray> found;

  // Per-component checks.
  for (int comp = 0; comp < numComps; ++comp)
  {
    arr->GetUniqueComponentValues(comp, found.GetPointer());
    const int count = static_cast<int>(found->GetNumberOfTuples());
    cout << " comp " << comp << " (" << count << "): ";
    // The first value is preceded by a space, every later one by a comma.
    const char* separator = " ";
    for (int n = 0; n <= found->GetMaxId(); ++n)
    {
      cout << separator << found->GetVariantValue(n).ToString().c_str();
      separator = ", ";
    }
    cout << endl;

    const int lowest = uniqueSizeBds[2 * comp];
    const int highest = uniqueSizeBds[2 * comp + 1];
    if (count < lowest || count > highest)
    {
      cout
        << " ** ERROR: Expected between " << lowest
        << " and " << highest << " values\n";
      passed = false;
    }
  }

  // Whole-tuple check; only meaningful for multi-component arrays.
  if (numComps > 1)
  {
    arr->GetUniqueComponentValues(-1, found.GetPointer());
    const int count = static_cast<int>(found->GetNumberOfTuples());
    cout << " tuples (" << count << "): ";
    for (int n = 0; n <= found->GetMaxId(); ++n)
    {
      // Values are stored flat; a comma marks the start of each new tuple.
      const bool startsNewTuple = (n > 0) && (n % numComps == 0);
      cout << (startsNewTuple ? ", " : " ") << found->GetVariantValue(n).ToString().c_str();
    }
    cout << endl;

    const int lowest = uniqueSizeBds[2 * numComps];
    const int highest = uniqueSizeBds[2 * numComps + 1];
    if (count < lowest || count > highest)
    {
      cout
        << " ** ERROR: Expected between " << lowest
        << " and " << highest << " values\n";
      passed = false;
    }
  }
  return passed;
}
// Test driver: wraps each static fixture in a VTK array of the matching type
// and checks the number of unique values reported against the expected bounds.
// Every fixture is checked even if an earlier one fails.
// Returns 0 when all checks pass and 1 otherwise.
int TestArrayUniqueValueDetection(int vtkNotUsed(argc), char* vtkNotUsed(argv)[])
{
  vtkNew<vtkDoubleArray> darr;
  vtkNew<vtkIntArray> iarr;
  vtkNew<vtkStringArray> sarr;
  bool allPassed = true;

  // Three-component doubles: every component and the tuples are discrete.
  darr->SetNumberOfComponents(3);
  darr->SetArray(testTuplesDouble, sizeof(testTuplesDouble)/sizeof(testTuplesDouble[0]), 1);
  darr->SetName("Some3DPoints");
  if (!CheckUniques(darr.GetPointer(), numUniqueDouble))
  {
    allPassed = false;
  }

  // Two-component ints: discrete components, non-discrete tuples.
  iarr->SetNumberOfComponents(2);
  iarr->SetArray(testTuplesInt, sizeof(testTuplesInt)/sizeof(testTuplesInt[0]), 1);
  iarr->SetName("Some2DPoints");
  if (!CheckUniques(iarr.GetPointer(), numUniqueInt))
  {
    allPassed = false;
  }

  // The same int array object is re-pointed at a single-component fixture
  // in which no value repeats.
  iarr->SetNumberOfComponents(1);
  iarr->SetArray(testTuplesInt2, sizeof(testTuplesInt2)/sizeof(testTuplesInt2[0]), 1);
  //iarr->Modified(); // required since we have changed the tuples?
  iarr->SetName("Some1DPoints");
  if (!CheckUniques(iarr.GetPointer(), numUniqueInt2))
  {
    allPassed = false;
  }

  // Single-component strings, long enough to exercise the sampling path.
  sarr->SetNumberOfComponents(1);
  sarr->SetArray(testTuplesString, sizeof(testTuplesString)/sizeof(testTuplesString[0]), 1);
  sarr->SetName("SomeNonWords");
  if (!CheckUniques(sarr.GetPointer(), numUniqueString))
  {
    allPassed = false;
  }

  return allPassed ? 0 : 1;
}
......@@ -23,6 +23,7 @@
#include "vtkInformation.h"
#include "vtkIntArray.h"
#include "vtkLongArray.h"
#include "vtkMinimalStandardRandomSequence.h"
#include "vtkShortArray.h"
#include "vtkSignedCharArray.h"
#include "vtkStringArray.h"
......@@ -32,7 +33,12 @@
#include "vtkUnsignedLongArray.h"
#include "vtkUnsignedShortArray.h"
#include "vtkVariantArray.h"
#include "vtkInformationVector.h"
#include "vtkInformationIntegerKey.h"
#include "vtkInformationInformationVectorKey.h"
#include "vtkInformationVariantVectorKey.h"
#include <vtkNew.h>
#include "vtkUnicodeString.h" // for vtkSuperExtraExtendedTemplateMacro
#if defined(VTK_TYPE_USE_LONG_LONG)
# include "vtkLongLongArray.h"
......@@ -46,7 +52,12 @@
# endif
#endif
#include <set>
#include <cmath>
vtkInformationKeyMacro(vtkAbstractArray, GUI_HIDE, Integer);
vtkInformationKeyMacro(vtkAbstractArray, PER_COMPONENT, InformationVector);
vtkInformationKeyMacro(vtkAbstractArray, DISCRETE_VALUES, VariantVector);
namespace
{
......@@ -249,14 +260,22 @@ void vtkAbstractArray::DeepCopy( vtkAbstractArray* da )
}
//----------------------------------------------------------------------------
int vtkAbstractArray::CopyInformation(vtkInformation *infoFrom, int deep)
int vtkAbstractArray::CopyInformation(vtkInformation* infoFrom, int deep)
{
// Copy all keys. NOTE: subclasses rely on this.
vtkInformation *myInfo=this->GetInformation();
vtkInformation* myInfo=this->GetInformation();
myInfo->Copy(infoFrom,deep);
// remove any keys we own that are not to be coppied
// here.
// Remove any keys we own that are not to be copied here.
// For now, remove per-component metadata.
if (myInfo->Has(PER_COMPONENT()))
{
myInfo->Remove(PER_COMPONENT());
}
if (myInfo->Has(DISCRETE_VALUES()))
{
myInfo->Remove(DISCRETE_VALUES());
}
return 1;
}
......@@ -461,3 +480,227 @@ void vtkAbstractArray::InsertVariantValue(vtkIdType id, vtkVariant value)
}
this->SetVariantValue(id, value);
}
//--------------------------------------------------------------------------
void vtkAbstractArray::GetUniqueComponentValues(
int comp, vtkVariantArray* values)
{
if (!values || comp < -1)
return;
values->Initialize();
values->SetNumberOfComponents(comp < 0 ? this->NumberOfComponents : 1);
bool justCreated = false;
vtkInformation* info = this->GetInformation();
if (comp >= 0 && info)
{
vtkInformationVector* infoVec = info->Get(PER_COMPONENT());
if (!infoVec || infoVec->GetNumberOfInformationObjects() < this->NumberOfComponents)
{
infoVec = vtkInformationVector::New();
infoVec->SetNumberOfInformationObjects(this->NumberOfComponents);
info->Set(PER_COMPONENT(), infoVec);
infoVec->FastDelete();
justCreated = true;
}
info = infoVec->GetInformationObject(comp);
}
if (info)
{
// Recompute discrete value set when the array has been
// modified since the information was written.
if (this->GetMTime() > info->GetMTime() || justCreated)
{
this->UpdateDiscreteValueSet();
}
}
else
{
return;
}
vtkIdType len;
const vtkVariant* vals = info->Get(DISCRETE_VALUES());
if ((vals = info->Get(DISCRETE_VALUES())) != NULL)
{
len = info->Length(DISCRETE_VALUES());
values->SetNumberOfTuples(len / values->GetNumberOfComponents());
for (vtkIdType i = 0; i < len; ++i)
{
values->SetVariantValue(i, vals[i]);
}
}
}
//--------------------------------------------------------------------------
template<typename T>
bool AccumulateSampleValues(
T* array, int nc, vtkIdType begin, vtkIdType end,
std::vector<std::set<vtkVariant> >& uniques)
{
// number of discrete components remaining (tracked during iteration):
int ndc = nc;
std::pair<std::set<vtkVariant>::iterator,bool> result;
// Here we iterate over the components and add to their respective lists
// of previously encountered values -- as long as there are not too many
// values already in the list. We also accumulate each component's value
// into a vtkVariantArray named tuple, which is added to the list of
// unique vectors -- again assuming it is not already too long.
for (vtkIdType i = begin; i < end && ndc; ++i)
{
// First, attempt a per-component insert.
for (int j = 0; j < nc; ++ j)
{
if (uniques[j].size() > vtkAbstractArray::MAX_DISCRETE_VALUES)
continue;
result = uniques[j].insert(array[i * nc + j]);
//cout << i << " " << j << ": " << array[i*nc+j] << " " << (result.second ? "t" : "f") << endl;
if (result.second)
{
if (uniques[j].size() == vtkAbstractArray::MAX_DISCRETE_VALUES + 1)
{
-- ndc;
}
}
}
// Now, as long as no component has exceeded MAX_DISCRETE_VALUES unique
// values, it is worth seeing whether the tuple as a whole is unique:
if ( nc > 1 && ndc == nc )
{
vtkNew<vtkVariantArray> tuple;
tuple->SetNumberOfComponents(nc);
tuple->SetNumberOfTuples(1);
for (int j = 0; j < nc; ++j)
{
tuple->SetVariantValue(j, array[i * nc + j]);
}
vtkVariant wholeThing(tuple.GetPointer());
result = uniques[nc].insert(wholeThing);
}
}
return ndc == 0;
}
//--------------------------------------------------------------------------
void vtkAbstractArray::UpdateDiscreteValueSet()
{
// For an array with T tuples, we sample N blocks of M tuples each, with
// M*N = f(T) and f some sublinear function of T.
// If every component plus all components taken together each have more than
// MAX_DISCRETE_VALUES distinct values, then we exit early.
// M is chosen based on the number of bytes per tuple to maximize use of a cache
// line (assuming a 64-byte cache line until kwsys::SystemInformation or the like
// can provide a platform-independent way to query it).
// N is chosen to satisfy M*N = max(ceil(log(T)),min(2*MAX_DISCRETE_VALUES,T))
#define VTK_CACHE_LINE_SIZE 64
int nc = this->NumberOfComponents;
int blockSize = VTK_CACHE_LINE_SIZE / (this->GetDataTypeSize() * nc);
if (!blockSize) blockSize = 4;
int ln2 = 0;
vtkIdType nt = this->GetNumberOfTuples();
if (this->MaxId > 0) frexp(static_cast<double>(nt), &ln2);
vtkIdType numberOfSampleTuples = (ln2 <= 0 ? 1 : ln2);
vtkIdType numberOfBlocks =
numberOfSampleTuples / blockSize +
(numberOfSampleTuples % blockSize ? 1 : 0);
if (numberOfBlocks * blockSize < 2 * MAX_DISCRETE_VALUES)
{
numberOfBlocks =
2 * MAX_DISCRETE_VALUES / blockSize +
(2 * MAX_DISCRETE_VALUES % blockSize ? 1 : 0);
}
//std::vector<vtkNew<vtkVariantArray> > uniques(nc + 1);
std::vector<std::set<vtkVariant> > uniques(nc > 1 ? nc + 1 : nc);
if (numberOfBlocks * blockSize > this->MaxId / 2)
{ // Awwww, just do the whole array already!
switch(this->GetDataType())
{
vtkSuperExtraExtendedTemplateMacro(
AccumulateSampleValues(
static_cast<VTK_TT*>(this->GetVoidPointer(0)),
nc, 0, nt, uniques));
default:
vtkErrorMacro("Array type " << this->GetClassName() << " not supported.");
break;
}
}
else
{ // Choose random blocks
vtkNew<vtkMinimalStandardRandomSequence> seq;
// test different blocks each time we're called:
seq->SetSeed(seq->GetMTime() ^ 0xdeadbeef);
vtkIdType totalBlockCount =
nt / blockSize +
(nt % blockSize ? 1 : 0);
for (int i = 0; i < numberOfBlocks; ++ i, seq->Next())
{
vtkIdType startTuple =
static_cast<vtkIdType>(seq->GetValue() * totalBlockCount) * blockSize;
vtkIdType endTuple = startTuple + blockSize;
endTuple = endTuple < nt ? endTuple : nt;
bool endEarly;
switch(this->GetDataType())
{
vtkSuperExtraExtendedTemplateMacro(
endEarly = AccumulateSampleValues(
static_cast<VTK_TT*>(this->GetVoidPointer(0)),
nc, startTuple, endTuple, uniques));
default:
vtkErrorMacro("Array type " << this->GetClassName() << " not supported.");
endEarly = true;
break;
}
if (endEarly)
break;
}
}
int c;
for (c = 0; c < nc; ++c)
{
if (uniques[c].size() <= MAX_DISCRETE_VALUES)
{
std::vector<vtkVariant> compUniques;
std::set<vtkVariant>::iterator sit;
for (sit = uniques[c].begin(); sit != uniques[c].end(); ++sit)
{
//cout << " " << *sit << endl;
compUniques.push_back(*sit);
}
this->GetInformation()->Get(
PER_COMPONENT())->GetInformationObject(c)->Set(
DISCRETE_VALUES(), &compUniques[0],
static_cast<int>(compUniques.size()));
}
else
{
this->GetInformation()->Get(
PER_COMPONENT())->GetInformationObject(c)->Remove(
DISCRETE_VALUES());
}
}
if (nc > 1 && uniques[nc].size() <= MAX_DISCRETE_VALUES)
{
std::vector<vtkVariant> compUniques;
std::set<vtkVariant>::iterator sit;
for (sit = uniques[nc].begin(); sit != uniques[nc].end(); ++sit)
{
//cout << " " << *sit << endl;
vtkAbstractArray* tuple = sit->ToArray();
if (tuple)
{
for (c = 0; c <= tuple->GetMaxId(); ++c)
{
compUniques.push_back(tuple->GetVariantValue(c));
}
}
}
this->GetInformation()->Set(
DISCRETE_VALUES(), &compUniques[0],
static_cast<int>(compUniques.size()));
}
else
{ // Remove the key
this->GetInformation()->Remove(DISCRETE_VALUES());
}
}
......@@ -51,6 +51,9 @@ class vtkIdList;
class vtkIdTypeArray;
class vtkInformation;
class vtkInformationIntegerKey;
class vtkInformationInformationVectorKey;
class vtkInformationVariantVectorKey;
class vtkVariantArray;
class VTKCOMMONCORE_EXPORT vtkAbstractArray : public vtkObject
{
......@@ -314,6 +317,22 @@ public:
// function.
virtual void ClearLookup() = 0;
// Description:
// Populate the given vtkVariantArray with a set of unique values taken on
// by the requested component (or, when passed -1, by the tuples as a whole).
// If the set of unique values has more than 32 entries, then the array
// is assumed to be continuous in nature and no values are returned.
//
// The first time this is called, the array is examined and unique values
// are stored in the vtkInformation object associated with the array.
// The list of unique values will be updated on subsequent calls only if
// the array's MTime is newer than the associated vtkInformation object.
//
// Note that this set of returned values may not be complete; in order to
// perform interactively, a subsample of the array is used to determine the
// set of values.
virtual void GetUniqueComponentValues(int comp, vtkVariantArray* values);
// TODO: Implement these lookup functions also.
//virtual void LookupRange(vtkVariant min, vtkVariant max, vtkIdList* ids,
// bool includeMin = true, bool includeMax = true) = 0;
......@@ -348,16 +367,51 @@ public:
// is internal and should not be shown to the end user.
static vtkInformationIntegerKey* GUI_HIDE();
// Description:
// This key is used to hold a vector of COMPONENT_VALUES (and, for
// vtkDataArray subclasses, COMPONENT_RANGE) keys -- one
// for each component of the array. You may add additional per-component