How to get frequent items from transaction.Using data mining algorithms.
using System;
using
System.Collections.Generic;
using
System.Linq;
using
System.Text;
using
System.Threading.Tasks;
namespace
ConsoleApplication1
{
class Program
{
static void Main(string[] args)
{
try
{
Console.WriteLine("\nBegin frequent item-set extraction
demo\n");
string[] rawItems = new
string[] { "apples", "bread ", "celery",
"donuts", "eggs ",
"flour ",
"grapes", "honey
", "icing ", "jelly ", "kale ", "lettuce" };
int N = rawItems.Length; //
total number of items to deal with ( [0..11] )
string[][] rawTransactions
= new string[10][];
rawTransactions[0] = new
string[] { "apples", "bread ", "celery",
"flour", "grapes" };
// 0 1 2 5
rawTransactions[1] = new
string[] { "bread ", "eggs
", "flour", "grapes" }; // 1 4 5
rawTransactions[2] = new
string[] { "apples", "bread ", "donuts",
"eggs", "grapes" }; // 0 1 3 4
rawTransactions[3] = new
string[] { "celery", "donuts", "flour ",
"grapes" }; // 2 3 5 6
rawTransactions[4] = new
string[] { "donuts", "eggs
" }; //
3 4
rawTransactions[5] = new
string[] { "donuts", "eggs
", "jelly " };
// 3 4 9
rawTransactions[6] = new
string[] { "apples", "bread ", "donuts",
"icing " }; // 0 1 3 8
rawTransactions[7] = new
string[] { "bread ", "grapes", "honey " }; // 1 6 7
rawTransactions[8] = new
string[] { "apples", "bread ", "celery",
"flour ", "kale " };
// 0 1 2 5 10
rawTransactions[9] = new
string[] { "apples", "bread ", "celery",
"flour " }; // 0 1 2
5
Console.WriteLine("Raw
transactions are:");
Console.WriteLine("-----------------------------------------------");
for (int i = 0; i <
rawTransactions.Length; ++i)
{
Console.Write("[" + i + "] : ");
for (int j = 0; j <
rawTransactions[i].Length; ++j)
Console.Write(rawTransactions[i][j]
+ " ");
Console.WriteLine("");
}
// could do this
programmatically
List
transactions = new List();
transactions.Add(new int[]
{ 0, 1, 2, 5,6 });
transactions.Add(new int[]
{ 0, 1, 2, 5, 6 });
transactions.Add(new int[]
{ 0, 1, 2, 5, 6 });
transactions.Add(new int[]
{ 0, 1, 2, 5, 6 });
transactions.Add(new int[]
{ 0, 1, 2, 5, 6 });
transactions.Add(new int[]
{ 0, 1, 2, 5, 6 });
transactions.Add(new int[]
{ 0, 1, 2, 5, 6 });
transactions.Add(new int[]
{ 0, 1, 2, 5, 6 });
transactions.Add(new int[]
{ 0, 1, 2, 5, 6 });
transactions.Add(new int[]
{ 0, 1, 2, 5, 6 });
Console.WriteLine("\n\nEncoded transactions are:");
Console.WriteLine("-----------------------------------------------");
for (int i = 0; i <
transactions.Count; ++i)
{
Console.Write("[" + i + "] : ");
for (int j = 0; j <
transactions[i].Length; ++j)
Console.Write(transactions[i][j].ToString() + " ");
Console.WriteLine("");
}
Console.WriteLine("");
double minSupportPct = 0.30;
// minimum pct of transactions for an item-set to be 'frequent'
int minItemSetLength = 2;
int maxItemSetLength = 4;
Console.WriteLine("\nSetting minimum frequent support percent =
" +
minSupportPct.ToString("F2"));
Console.WriteLine("Setting minimum frequent item-set length =
" +
minItemSetLength);
Console.WriteLine("Setting
maximum frequent item-set length = " +
maxItemSetLength);
Console.WriteLine("Using Apriori algorithm to construct frequent
item-sets");
// everything happens here
List frequentItemSets =
GetFrequentItemSets(N,
transactions, minSupportPct, minItemSetLength, maxItemSetLength);
Console.WriteLine("\nFrequent item-sets in numeric form
are:");
Console.WriteLine("");
for (int i = 0; i <
frequentItemSets.Count; ++i)
Console.WriteLine(frequentItemSets[i].ToString());
Console.WriteLine("\nFrequent item-sets in string form are:");
Console.WriteLine("");
for (int i = 0; i <
frequentItemSets.Count; ++i)
{
for (int j = 0; j <
frequentItemSets[i].data.Length; ++j)
{
int v =
frequentItemSets[i].data[j];
Console.Write(rawItems[v] + "
");
}
Console.WriteLine("");
}
Console.WriteLine("\nEnd
frequent item-set extraction demo\n");
Console.ReadLine();
}
catch (Exception ex)
{
Console.WriteLine(ex.Message);
Console.ReadLine();
}
} // Main
static List
GetFrequentItemSets(int N, List transactions, double
minSupportPct,
int minItemSetLength, int
maxItemSetLength)
{
// create a List of frequent
ItemSet objects that are in transactions
// frequent means occurs in
minSupportPct percent of transactions
// N is total number of items
// uses a variation of the
Apriori algorithm
int minSupportCount =
(int)(transactions.Count * minSupportPct);
Dictionary
frequentDict = new Dictionary(); // key = int representation
of an ItemSet, val = is in List of frequent ItemSet objects
List
frequentList = new List(); // item set objects that meet minimum
count (in transactions) requirement
List validItems =
new List(); // inidividual items/values at any given point in time
to be used to construct new ItemSet (which may or may not meet threshhold
count)
// get counts of all individual
items
int[] counts = new int[N]; //
index is the item/value, cell content is the count
for (int i = 0; i <
transactions.Count; ++i)
{
for (int j = 0; j <
transactions[i].Length; ++j)
{
int v =
transactions[i][j];
++counts[v];
}
}
// for those items that meet
support threshold, add to valid list, frequent list, frequent dict
for (int i = 0; i <
counts.Length; ++i)
{
if (counts[i] >=
minSupportCount) // frequent item
{
validItems.Add(i); // i
is the item/value
int[] d = new int[1];
// the ItemSet ctor wants an array
d[0] = i;
ItemSet ci = new ItemSet(N, d,
1); // an ItemSet with size 1, ct 1
frequentList.Add(ci);
// it's frequent
frequentDict.Add(ci.hashValue, true); //
} // else skip this item
}
bool done = false; // done if
no new frequent item-sets found
for (int k = 2; k <=
maxItemSetLength && done == false; ++k) // construct all size k = 2, 3, 4, . . frequent item-sets
{
done = true; // assume no
new item-sets will be created
int numFreq =
frequentList.Count; // List size modified so store first
for (int i = 0; i <
numFreq; ++i) // use existing frequent item-sets to create new freq item-sets
with size+1
{
if (frequentList[i].k
!= k - 1) continue; // only use those ItemSet objects with size 1 less than new
ones being created
for (int j = 0; j <
validItems.Count; ++j)
{
int[] newData = new
int[k]; // data for a new item-set
for (int p = 0; p
< k - 1; ++p)
newData[p] =
frequentList[i].data[p]; // old data in
if (validItems[j]
<= newData[k - 2]) continue; // because item-values are in order we can skip
sometimes
newData[k - 1] =
validItems[j]; // new item-value
ItemSet ci = new
ItemSet(N, newData, -1); // ct to be determined
if
(frequentDict.ContainsKey(ci.hashValue) == true) // this new ItemSet has
already been added
continue;
int ct =
CountTimesInTransactions(ci, transactions); // how many times is the new
ItemSet in the transactuions?
if (ct >=
minSupportCount) // we have a winner!
{
ci.ct = ct; //
now we know the ct
frequentList.Add(ci);
frequentDict.Add(ci.hashValue, true);
done = false;
// a new item-set was created, so we're not done
}
} // j
} // i
// update valid items --
quite subtle
validItems.Clear();
Dictionary
validDict = new Dictionary(); // track new list of valid items
for (int idx = 0; idx <
frequentList.Count; ++idx)
{
if (frequentList[idx].k
!= k) continue; // only looking at the just-created item-sets
for (int j = 0; j <
frequentList[idx].data.Length; ++j)
{
int v =
frequentList[idx].data[j]; // item
if (validDict.ContainsKey(v)
== false)
{
//Console.WriteLine("adding " + v + " to valid items
list");
validItems.Add(v);
validDict.Add(v, true);
}
}
}
validItems.Sort(); // keep
valid item-values ordered so item-sets will always be ordered
} // next k
// transfer to return result,
filtering by minItemSetCount
List result =
new List();
for (int i = 0; i <
frequentList.Count; ++i)
{
if (frequentList[i].k >=
minItemSetLength)
result.Add(new
ItemSet(frequentList[i].N, frequentList[i].data, frequentList[i].ct));
}
return result;
}
static int
CountTimesInTransactions(ItemSet itemSet, List transactions)
{
// number of times itemSet
occurs in transactions
int ct = 0;
for (int i = 0; i <
transactions.Count; ++i)
{
if
(itemSet.IsSubsetOf(transactions[i]) == true)
++ct;
}
return ct;
}
} // program
public class ItemSet
{
public int N; // data items are
[0..N-1]
public int k; // number of items
public int[] data; // ex: [0 2 5]
public int hashValue; // "0 2
5" -> 520 (for hashing)
public int ct; // num times this
occurs in transactions
public ItemSet(int N, int[] items,
int ct)
{
this.N = N;
this.k = items.Length;
this.data = new int[this.k];
Array.Copy(items, this.data,
items.Length);
this.hashValue =
ComputeHashValue(items);
this.ct = ct;
}
private static int
ComputeHashValue(int[] data)
{
int value = 0;
int multiplier = 1;
for (int i = 0; i <
data.Length; ++i) // actually working backward
{
value = value + (data[i] *
multiplier);
multiplier = multiplier *
10;
}
return value;
}
public override string ToString()
{
string s = "{ ";
for (int i = 0; i <
data.Length; ++i)
s += data[i] + "
";
return s + "}" +
" ct = " + ct; ;
}
public bool IsSubsetOf(int[] trans)
{
// 'trans' is an ordered
transaction like [0 1 4 5 8]
int foundIdx = -1;
for (int j = 0; j <
this.data.Length; ++j)
{
foundIdx = IndexOf(trans,
this.data[j], foundIdx + 1);
if (foundIdx == -1) return
false;
}
return true;
}
private static int IndexOf(int[]
array, int item, int startIdx)
{
for (int i = startIdx; i <
array.Length; ++i)
{
if (i > item) return -1;
// i is past where the target could possibly be
if (array[i] == item)
return i;
}
return -1;
}
}
}
No comments:
Post a Comment