方法: CSV テキスト ファイルの列値を計算する (LINQ)

この例では、.csv ファイルの列に対して、合計、平均、最小値、および最大値などの集計を実行する方法を示します。 この例で示される原則は、別の型の構造化テキストにも適用できます。

ソース ファイルを作成するには

  • 以下の行を scores.csv という名前のファイルにコピーし、ソリューション フォルダーに保存します。 最初の列が学生 ID、残りの列が 4 つの試験の点数を表すとします。

    111, 97, 92, 81, 60
    112, 75, 84, 91, 39
    113, 88, 94, 65, 91
    114, 97, 89, 85, 82
    115, 35, 72, 91, 70
    116, 99, 86, 90, 94
    117, 93, 92, 80, 87
    118, 92, 90, 83, 78
    119, 68, 79, 88, 92
    120, 99, 82, 81, 79
    121, 96, 85, 91, 60
    122, 94, 92, 91, 91
    

使用例

    Class SumColumns

        Public Shared Sub Main()

            Dim lines As String() = System.IO.File.ReadAllLines("../../../scores.csv")

            ' Specifies the column to compute 
            ' This value could be passed in at runtime. 
            Dim exam = 3

            ' Spreadsheet format: 
            ' Student ID    Exam#1  Exam#2  Exam#3  Exam#4 
            ' 111,          97,     92,     81,     60 
            ' one is added to skip over the first column 
            ' which holds the student ID.
            SumColumn(lines, exam + 1)
            Console.WriteLine()
            MultiColumns(lines)

            ' Keep the console window open in debug mode.
            Console.WriteLine("Press any key to exit...")
            Console.ReadKey()

        End Sub 

        Shared Sub SumColumn(ByVal lines As IEnumerable(Of String), ByVal col As Integer)

            ' This query performs two steps: 
            ' split the string into a string array 
            ' convert the specified element to 
            ' integer and select it. 
            Dim columnQuery = From line In lines 
                               Let x = line.Split(",") 
                               Select Convert.ToInt32(x(col))

            ' Execute and cache the results for performance. 
            ' Only needed with very large files. 
            Dim results = columnQuery.ToList()

            ' Perform aggregate calculations  
            ' on the column specified by col. 
            Dim avgScore = Aggregate score In results Into Average(score)
            Dim minScore = Aggregate score In results Into Min(score)
            Dim maxScore = Aggregate score In results Into Max(score)

            Console.WriteLine("Single Column Query:")
            Console.WriteLine("Exam #{0}: Average:{1:##.##} High Score:{2} Low Score:{3}", 
                         col, avgScore, maxScore, minScore)


        End Sub 

        Shared Sub MultiColumns(ByVal lines As IEnumerable(Of String))

            Console.WriteLine("Multi Column Query:")

            ' Create the query. It will produce nested sequences.  
            ' multiColQuery performs these steps: 
            ' 1) convert the string to a string array 
            ' 2) skip over the "Student ID" column and take the rest 
            ' 3) convert each field to an int and select that  
            '    entire sequence as one row in the results. 
            Dim multiColQuery = From line In lines 
                                Let fields = line.Split(",") 
                                Select From str In fields Skip 1 
                                            Select Convert.ToInt32(str)

            Dim results = multiColQuery.ToList()

            ' Find out how many columns we have. 
            Dim columnCount = results(0).Count()

            ' Perform aggregate calculations on each column.             
            ' One loop for each score column in scores. 
            ' We can use a for loop because we have already 
            ' executed the multiColQuery in the call to ToList. 

            For j As Integer = 0 To columnCount - 1
                Dim column = j
                Dim res2 = From row In results 
                           Select row.ElementAt(column)

                ' Perform aggregate calculations  
                ' on the column specified by col. 
                Dim avgScore = Aggregate score In res2 Into Average(score)
                Dim minScore = Aggregate score In res2 Into Min(score)
                Dim maxScore = Aggregate score In res2 Into Max(score)

                ' Add 1 to column numbers because exams in this course start with #1
                Console.WriteLine("Exam #{0} Average: {1:##.##} High Score: {2} Low Score: {3}", 
                                  column + 1, avgScore, maxScore, minScore)

            Next 
        End Sub 

    End Class 
    ' Output: 
    ' Single Column Query: 
    ' Exam #4: Average:76.92 High Score:94 Low Score:39 

    ' Multi Column Query: 
    ' Exam #1 Average: 86.08 High Score: 99 Low Score: 35 
    ' Exam #2 Average: 86.42 High Score: 94 Low Score: 72 
    ' Exam #3 Average: 84.75 High Score: 91 Low Score: 65 
    ' Exam #4 Average: 76.92 High Score: 94 Low Score: 39
class SumColumns
{
    static void Main(string[] args)
    {
        string[] lines = System.IO.File.ReadAllLines(@"../../../scores.csv");

        // Specifies the column to compute. 
        int exam = 3;

        // Spreadsheet format: 
        // Student ID    Exam#1  Exam#2  Exam#3  Exam#4 
        // 111,          97,     92,     81,     60 

        // Add one to exam to skip over the first column, 
        // which holds the student ID.
        SingleColumn(lines, exam + 1);
        Console.WriteLine();
        MultiColumns(lines);

        Console.WriteLine("Press any key to exit");
        Console.ReadKey();
    }

    static void SingleColumn(IEnumerable<string> strs, int examNum)
    {
        Console.WriteLine("Single Column Query:");

        // Parameter examNum specifies the column to  
        // run the calculations on. This value could be 
        // passed in dynamically at runtime.              

        // Variable columnQuery is an IEnumerable<int>. 
        // The following query performs two steps: 
        // 1) use Split to break each row (a string) into an array  
        //    of strings,  
        // 2) convert the element at position examNum to an int 
        //    and select it. 
        var columnQuery =
            from line in strs
            let elements = line.Split(',')
            select Convert.ToInt32(elements[examNum]);

        // Execute the query and cache the results to improve 
        // performance. This is helpful only with very large files. 
        var results = columnQuery.ToList();

        // Perform aggregate calculations Average, Max, and 
        // Min on the column specified by examNum. 
        double average = results.Average();
        int max = results.Max();
        int min = results.Min();

        Console.WriteLine("Exam #{0}: Average:{1:##.##} High Score:{2} Low Score:{3}",
                 examNum, average, max, min);
    }

    static void MultiColumns(IEnumerable<string> strs)
    {
        Console.WriteLine("Multi Column Query:");

        // Create a query, multiColQuery. Explicit typing is used 
        // to make clear that, when executed, multiColQuery produces  
        // nested sequences. However, you get the same results by 
        // using 'var'. 

        // The multiColQuery query performs the following steps: 
        // 1) use Split to break each row (a string) into an array  
        //    of strings,  
        // 2) use Skip to skip the "Student ID" column, and store the 
        //    rest of the row in scores. 
        // 3) convert each score in the current row from a string to 
        //    an int, and select that entire sequence as one row  
        //    in the results.
        IEnumerable<IEnumerable<int>> multiColQuery =
            from line in strs
            let elements = line.Split(',')
            let scores = elements.Skip(1)
            select (from str in scores
                    select Convert.ToInt32(str));

        // Execute the query and cache the results to improve 
        // performance.  
        // ToArray could be used instead of ToList. 
        var results = multiColQuery.ToList();

        // Find out how many columns you have in results. 
        int columnCount = results[0].Count();

        // Perform aggregate calculations Average, Max, and 
        // Min on each column.             
        // Perform one iteration of the loop for each column  
        // of scores. 
        // You can use a for loop instead of a foreach loop  
        // because you already executed the multiColQuery  
        // query by calling ToList. 
        for (int column = 0; column < columnCount; column++)
        {
            var results2 = from row in results
                           select row.ElementAt(column);
            double average = results2.Average();
            int max = results2.Max();
            int min = results2.Min();

            // Add one to column because the first exam is Exam #1, 
            // not Exam #0.
            Console.WriteLine("Exam #{0} Average: {1:##.##} High Score: {2} Low Score: {3}",
                          column + 1, average, max, min);
        }
    }
}
/* Output:
    Single Column Query:
    Exam #4: Average:76.92 High Score:94 Low Score:39

    Multi Column Query:
    Exam #1 Average: 86.08 High Score: 99 Low Score: 35
    Exam #2 Average: 86.42 High Score: 94 Low Score: 72
    Exam #3 Average: 84.75 High Score: 91 Low Score: 65
    Exam #4 Average: 76.92 High Score: 94 Low Score: 39
 */

このクエリは、Split メソッドを使用してテキストの各行を配列に変換します。 各配列の要素は、列を表します。 最後に、各列のテキストは数値表現に変換されます。 ファイルがタブ区切りファイルの場合は、Split メソッドの引数を \t に変更するだけです。

コードのコンパイル

  • .NET Framework Version 3.5 を対象とする Visual Studio プロジェクトを作成します。 既定のプロジェクトには、System.Core.dll への参照と、System.Linq 名前空間に対する using ディレクティブ (C#) または Imports ステートメント (Visual Basic) が含まれます。

  • このコードをプロジェクト内にコピーします。

  • F5 キーを押して、プログラムをコンパイルおよび実行します。

  • 任意のキーを押して、コンソール ウィンドウを終了します。

参照

概念

LINQ と文字列

LINQ とファイル ディレクトリ