Using Data
This post contains the following topics:
- Working with Arrays
- Loading Distributed Data from a File
- Creating Distributed Arrays from Azure Blobs
- Accessing Data with LINQ
Working with Arrays
You can create either dense n-dimensional arrays or distributed dense n-dimensional arrays using the Microsoft codename “Cloud Numerics” lab.
Creating Arrays
You can create dense arrays with Numerics.Local. For example:
using local = Microsoft.Numerics.Local;
// Wrap a 2-by-2 managed array of doubles in a local dense array
double[,] values =
{
    { -0.5, 1.0 },
    {  0.5, 1.0 }
};
var a = local.NumericDenseArrayFactory.CreateFromSystemArray<double>(values);
Creating Distributed Arrays
You can create distributed dense arrays with Numerics.Distributed. For example:
using dist = Microsoft.Numerics.Distributed;
var c = new dist.NumericDenseArray<double>(a); // Explicitly construct a distributed dense array from the existing array a
Casting Arrays
You can convert a distributed array to a local array by calling its ToLocalArray() method. For example:
var d = c.ToLocalArray(); // Explicit conversion of the distributed array c to a local array
You can also assign a local array directly to a distributed array variable; the conversion is implicit. For example:
// Build a local 2-by-2 dense array from a managed array
double[,] entries =
{
    { -0.5, 1.0 },
    {  0.5, 1.0 }
};
var a = local.NumericDenseArrayFactory.CreateFromSystemArray<double>(entries);

// No explicit constructor call or cast is needed for this assignment
dist.NumericDenseArray<double> c = a; // Assignment with backend distributed data
Loading Distributed Data from a File
The “Cloud Numerics” lab provides an interface you can implement for loading data from a file.
The steps for loading distributed data from a file are:
1. Create a class that returns an object that conforms to the Numerics.Distributed.IO.IParallelReader interface or else use or modify the Distributed.IO.CSVLoader class provided in the Cloud Numerics lab distribution.
2. Use the Distributed.IO.Loader.LoadData() method to load your data into a distributed dense array.
For more details, see the blog post titled Using the IParallelReader Interface.
Creating Distributed Arrays from Azure Blobs
For more information on Windows Azure Blob storage, navigate to the following Getting Started page: https://www.microsoft.com/windowsazure/learn/get-started/
Creating Serial IO from Blobs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.WindowsAzure.StorageClient;
using msnl = Microsoft.Numerics.Local;
using msnd = Microsoft.Numerics.Distributed;
namespace ExampleWithSerialIO
{
    /// <summary>
    /// Example program that downloads a matrix stored as binary data in a single blob,
    /// converts it to a local NumericDenseArray, assigns it to a distributed array and
    /// prints the mean of the data.
    /// </summary>
    class Program
    {
        // Sample blobs that hold matrices of random numbers as binary data
        static readonly string accountName = @"https://cloudnumericslab.blob.core.windows.net/";

        // 1000-by-1000 matrix
        static readonly string blobAddress = @"https://cloudnumericslab.blob.core.windows.net/arraycollection/mediummatrix";

        // Number of bytes occupied by one matrix element in the blob
        const int BytesPerElement = sizeof(double);

        /// <summary>
        /// Reads the blob at <c>blobAddress</c> and converts it into a local
        /// NumericDenseArray of doubles.
        /// </summary>
        /// <remarks>
        /// The matrix shape is read from the blob metadata entries "dimension0" (rows)
        /// and "dimension1" (columns). Elements are read from the blob in row-major
        /// order, eight bytes per element.
        /// </remarks>
        /// <returns>A rows-by-columns local array holding the blob contents.</returns>
        /// <exception cref="System.IO.InvalidDataException">
        /// The metadata dimensions are negative, or the blob holds fewer bytes than the
        /// dimensions require.
        /// </exception>
        public static msnl.NumericDenseArray<double> ReadBlob()
        {
            // Metadata values are machine-written, so parse them culture-independently
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            // Get reference to blob
            var blobClient = new CloudBlobClient(accountName);
            var blob = blobClient.GetBlobReference(blobAddress);

            // Read number of rows and columns from blob metadata
            blob.FetchAttributes();
            long rows = Convert.ToInt64(blob.Metadata["dimension0"], invariant);
            long columns = Convert.ToInt64(blob.Metadata["dimension1"], invariant);

            // Download the data and make sure it really contains rows * columns doubles.
            // The comparison is done by division so that it cannot overflow.
            byte[] blobData = blob.DownloadByteArray();
            long availableElements = blobData.LongLength / BytesPerElement;
            if (rows < 0 || columns < 0 || (columns > 0 && rows > availableElements / columns))
            {
                throw new System.IO.InvalidDataException(
                    "Blob data is inconsistent with the dimensions stored in its metadata");
            }

            // Convert blob binary data to local NumericDenseArray
            var outArray = msnl.NumericDenseArrayFactory.Create<double>(new long[] { rows, columns });

            // Walk the buffer sequentially; the size check above guarantees that the
            // offset stays within the bounds of blobData (and therefore of int).
            int offset = 0;
            for (long i = 0; i < rows; i++)
            {
                for (long j = 0; j < columns; j++)
                {
                    outArray[i, j] = BitConverter.ToDouble(blobData, offset);
                    offset += BytesPerElement;
                }
            }

            return outArray;
        }

        /// <summary>
        /// Entry point: initializes the runtime, loads the data, prints its mean and
        /// shuts the runtime down.
        /// </summary>
        static void Main()
        {
            // Initialize runtime
            Microsoft.Numerics.NumericsRuntime.Initialize();

            // Read data and implicitly cast to distributed array
            msnd.NumericDenseArray<double> data = ReadBlob();

            // Compute mean of dataset
            var mean = Microsoft.Numerics.Statistics.Descriptive.Mean(data);

            // Write result. When running on Windows Azure cluster,
            // the output is available in job output
            Console.WriteLine("Mean of data: {0}", mean);

            // Shut down runtime
            Microsoft.Numerics.NumericsRuntime.Shutdown();
        }
    }
}
Creating Distributed IO from Blobs
using System;
using System.Linq;
using msnl = Microsoft.Numerics.Local;
using msnd = Microsoft.Numerics.Distributed;
using Microsoft.Numerics;
using Microsoft.WindowsAzure;
using Microsoft.WindowsAzure.StorageClient;
// An example class for reading an array from blob storage
// Each blob contains a piece of 2-D array
namespace AzureArrayReader
{
    /// <summary>
    /// Parallel reader that assembles a 2-D array of doubles from blobs named
    /// "slab0", "slab1", ... in a single blob container. Each blob holds one piece
    /// of the array; the pieces are concatenated along the column dimension.
    /// </summary>
    [Serializable()]
    public class AzureArrayReader : msnd.IO.IParallelReader<double>
    {
        // Number of bytes occupied by one array element in a blob
        private const int BytesPerElement = sizeof(double);

        // Blob names are this prefix followed by the slab index
        private const string SlabPrefix = "slab";

        private string accountName;
        private string containerName;

        /// <summary>Creates a reader for the given storage account and container.</summary>
        /// <param name="accountName">Address passed to the CloudBlobClient constructor.</param>
        /// <param name="containerName">Name of the container that holds the slab blobs.</param>
        public AzureArrayReader(string accountName, string containerName)
        {
            this.accountName = accountName;
            this.containerName = containerName;
        }

        /// <summary>
        /// Assigns blobs to MPI ranks. Every blob listed in the container is counted,
        /// and the blobs are handed out as consecutive runs of at most
        /// ceil(blobCount / nranks) blobs per rank.
        /// </summary>
        /// <param name="nranks">Number of ranks to distribute the blobs over.</param>
        /// <returns>
        /// One entry per rank; each entry is an <c>int[] { firstSlabIndex, slabCount }</c>.
        /// Ranks left without blobs get a slab count of zero.
        /// </returns>
        public object[] ComputeAssignment(int nranks)
        {
            object[] assignments = new object[nranks];
            var blobClient = new CloudBlobClient(accountName);
            var matrixContainer = blobClient.GetContainerReference(containerName);
            var blobCount = matrixContainer.ListBlobs().Count();
            int maxBlobsPerRank = (int)Math.Ceiling((double)blobCount / (double)nranks);

            int currentBlob = 0;
            for (int rank = 0; rank < nranks; rank++)
            {
                // Take a full share while blobs remain, then whatever is left (possibly zero)
                int remaining = blobCount - currentBlob;
                int step = Math.Max(0, Math.Min(maxBlobsPerRank, remaining));
                assignments[rank] = new int[] { currentBlob, step };
                currentBlob += step;
            }

            return assignments;
        }

        /// <summary>
        /// Dimension along which the pieces are concatenated. Always 1 (columns);
        /// values assigned through the setter are ignored.
        /// </summary>
        public int DistributedDimension
        {
            get { return 1; }
            set { }
        }

        /// <summary>
        /// Reads the slabs described by <paramref name="assignment"/> and concatenates
        /// them column-wise into one local array.
        /// </summary>
        /// <remarks>
        /// The row count is taken from the metadata of "slab0", so a rank with no slabs
        /// can still return an array with the right number of rows and zero columns.
        /// Each slab is read as column-major data, eight bytes per element.
        /// </remarks>
        /// <param name="assignment">
        /// An <c>int[] { firstSlabIndex, slabCount }</c> as produced by
        /// <see cref="ComputeAssignment"/>.
        /// </param>
        /// <returns>A local array of shape rows x (sum of the assigned slabs' columns).</returns>
        /// <exception cref="ArgumentException">
        /// The assignment is null, has fewer than two entries, or has a negative slab count.
        /// </exception>
        /// <exception cref="System.IO.InvalidDataException">
        /// A slab's row count differs from that of "slab0", a slab reports a negative
        /// dimension, or a slab holds fewer bytes than its metadata dimensions require.
        /// </exception>
        public msnl.NumericDenseArray<double> ReadWorker(Object assignment)
        {
            int[] blobs = (int[])assignment;
            if (blobs == null || blobs.Length < 2 || blobs[1] < 0)
            {
                throw new ArgumentException(
                    "Assignment must be an int[] of the form { firstSlabIndex, slabCount }.",
                    "assignment");
            }

            long firstSlab = blobs[0];
            int slabCount = blobs[1];

            // Metadata values are machine-written, so parse them culture-independently
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            var blobClient = new CloudBlobClient(accountName);
            var matrixContainer = blobClient.GetContainerReference(containerName);

            var firstBlob = matrixContainer.GetBlockBlobReference(SlabName(0));
            firstBlob.FetchAttributes();
            long rows = Convert.ToInt64(firstBlob.Metadata["dimension0"], invariant);

            // If a rank was assigned zero blobs, return empty array
            if (slabCount == 0)
            {
                return msnl.NumericDenseArrayFactory.Create<double>(new long[] { rows, 0 });
            }

            // Get blob metadata, validate that each piece has equal number of rows
            long[] columnsPerSlab = new long[slabCount];
            long totalColumns = 0;
            for (int s = 0; s < slabCount; s++)
            {
                var slab = matrixContainer.GetBlockBlobReference(SlabName(firstSlab + s));
                slab.FetchAttributes();
                if (Convert.ToInt64(slab.Metadata["dimension0"], invariant) != rows)
                {
                    throw new System.IO.InvalidDataException("Invalid slab shape");
                }

                columnsPerSlab[s] = Convert.ToInt64(slab.Metadata["dimension1"], invariant);
                totalColumns += columnsPerSlab[s];
            }

            // Construct output array
            msnl.NumericDenseArray<double> outArray =
                msnl.NumericDenseArrayFactory.Create<double>(new long[] { rows, totalColumns });

            // Read data
            long columnCounter = 0;
            for (int s = 0; s < slabCount; s++)
            {
                // Use the same kind of blob reference as the metadata pass above
                var slab = matrixContainer.GetBlockBlobReference(SlabName(firstSlab + s));
                byte[] blobData = slab.DownloadByteArray();
                ValidateSlabData(blobData, rows, columnsPerSlab[s]);

                // Walk the buffer sequentially; the validation above guarantees that the
                // offset stays within the bounds of blobData (and therefore of int).
                int offset = 0;
                for (long j = 0; j < columnsPerSlab[s]; j++)
                {
                    for (long k = 0; k < rows; k++)
                    {
                        outArray[k, columnCounter] = BitConverter.ToDouble(blobData, offset);
                        offset += BytesPerElement;
                    }

                    columnCounter++;
                }
            }

            return outArray;
        }

        // Builds the blob name for the slab with the given index, e.g. "slab3".
        private static string SlabName(long index)
        {
            return SlabPrefix + index.ToString(System.Globalization.CultureInfo.InvariantCulture);
        }

        // Throws if the dimensions are negative or the buffer holds fewer than
        // rows * columns doubles. The comparison uses division so it cannot overflow.
        private static void ValidateSlabData(byte[] data, long rows, long columns)
        {
            long availableElements = data.LongLength / BytesPerElement;
            if (rows < 0 || columns < 0 || (columns > 0 && rows > availableElements / columns))
            {
                throw new System.IO.InvalidDataException(
                    "Slab data is inconsistent with the dimensions stored in its metadata");
            }
        }
    }
}
Accessing Data with LINQ
This section provides the following examples of how to use the C# LINQ extensions to access array data.
- Extracting Selected Data by Index
- Filtering out NaN Values
Extracting Selected Data by Index
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.Numerics;
using Microsoft.Numerics.Local;
namespace HowToRecipes
{
    /// <summary>
    /// Shows how to pull a contiguous range of elements out of a numeric dense array
    /// with the index-aware LINQ <c>Where</c> overload.
    /// </summary>
    class LINQtoNDAExtractExample
    {
        /// <summary>
        /// Builds a six-element integer array, extracts the elements whose positions lie
        /// between the start and end index (both inclusive) and prints the input, the
        /// bounds and the result.
        /// </summary>
        public static void Run()
        {
            // Create Numeric Dense Array
            int[] source = { 1, 2, 3, 4, 5, 6 };
            var numbers = NumericDenseArrayFactory.CreateFromSystemArray<int>(source);

            // Set indexes of start and end of the part to be extracted
            int idxStart = 1;
            int idxEnd = 4;

            Console.WriteLine("All numbers: {0}", numbers);
            Console.WriteLine("Start index: {0}, End index {1}", idxStart, idxEnd);

            // Extract: keep every element whose position is not outside [idxStart, idxEnd]
            var selected = numbers
                .Where((value, position) => !(position < idxStart || position > idxEnd))
                .ToArray();
            NumericDenseArray<int> outArray =
                NumericDenseArrayFactory.CreateFromSystemArray<int>(selected);

            Console.WriteLine("Extracted array: {0}", outArray);
        }
    }
}
Filtering out NaN Values
using System;
using System.Linq;
using System.Collections;
using System.Collections.Generic;
using Microsoft.Numerics;
using Microsoft.Numerics.Local;
namespace HowToRecipes
{
    /// <summary>
    /// Shows how to drop NaN entries from a numeric dense array using LINQ.
    /// </summary>
    class LINQtoNDATrimNaNsExample
    {
        /// <summary>
        /// Builds an array of doubles that contains NaN values, filters the NaNs out
        /// and prints the array before and after filtering.
        /// </summary>
        public static void Run()
        {
            // Create Numeric dense array with NaNs
            double[] raw = { double.NaN, 1.0, 2.0, 3.0, double.NaN, 4.0, 5.0, 6.0 };
            var sampleNan = NumericDenseArrayFactory.CreateFromSystemArray<double>(raw);
            Console.WriteLine("Array with NaNs: {0}", sampleNan);

            // Trim NaN: keep only the elements that are actual numbers
            var kept = (from value in sampleNan
                        where !double.IsNaN(value)
                        select value).ToArray();
            var cleanedNDA = NumericDenseArrayFactory.CreateFromSystemArray<double>(kept);
            Console.WriteLine("Trimmed array: {0}", cleanedNDA);
        }
    }
}