View
8
Download
0
Category
Preview:
Citation preview
11/10/2015
CIS 660: Data Mining Project
Dhruv Patel
CSU ID: 2652790
Darshan Pathak
CSU ID: 2640944
Instructor:
Dr. Sunnie S Chung
Cleveland State University
Page 1
Project description:
Text Mining on Web Documents www.infoplease.com/t/hist/state-of-the-union/
This download contains the text of 219 State of the Union addresses of U.S.
Presidents between 1790 and 2006.
Once the data is successfully stored, we can build a document-frequency table and an
inverted index (standard information-retrieval structures), then compute a tf-idf
weight matrix and use it to calculate cosine similarity between documents.
Project Member:-
Dhruv Patel: 2652790
Darshan Pathak: 2640944
Tools and Technologies:-
SDK: C#.Net
Nature of Data: SQL Data
Cleveland State University
Page 2
1. Data Gathering
In this step we removed the tedious job of visiting the website and copying all the
data into notepad files by hand: we wrote a simple C# program which
goes to the website, collects the relevant div of the body element, and stores the
data in a SQL table.
Program.cs
using System;
using System.Collections.Generic;
using System.Data.SqlClient;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace FetchHtml
{
class Program
{
    // Index page that lists every State of the Union address.
    static string urlAddress = "http://www.infoplease.com/t/hist/state-of-the-union/";

    /// <summary>
    /// Downloads the index page, extracts the per-address links, then fetches
    /// each address and stores (document name, text) into tblHistory.
    /// </summary>
    static void Main(string[] args)
    {
        List<LinkData> lstLinks = StripHTMLLinks(getHTMLString(urlAddress));

        // Open a single connection for the whole run instead of re-creating
        // one per link; `using` guarantees it is closed even on error.
        using (SqlConnection con = new SqlConnection("Data Source=FRNDZ;Initial Catalog=StateOfTheUnion;Integrated Security=True"))
        {
            con.Open();
            foreach (var link in lstLinks)
            {
                var strHtml = getHTMLString(link.URL);
                try
                {
                    // Parameterized insert: immune to quote characters in the
                    // text and to SQL injection (the original concatenated
                    // the values directly into the statement).
                    using (SqlCommand sqlStatement = new SqlCommand(
                        "Insert into tblHistory values (@docName, @text);", con))
                    {
                        sqlStatement.Parameters.AddWithValue("@docName", link.DocName);
                        sqlStatement.Parameters.AddWithValue("@text", StripHTMLText(strHtml));
                        sqlStatement.ExecuteNonQuery();
                    }
                }
                catch (Exception e)
                {
                    // Report the failure instead of silently swallowing it,
                    // then keep going with the remaining documents.
                    Console.WriteLine("Failed to store '" + link.DocName + "': " + e.Message);
                }
            }
        }
        Console.Read();
    }

    /// <summary>
    /// Fetches <paramref name="url"/> and returns the response body as a
    /// string (empty string when the status is not 200 OK).
    /// </summary>
    public static string getHTMLString(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        string data = string.Empty;
        // using blocks dispose the response and reader even when ReadToEnd throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode == HttpStatusCode.OK)
            {
                Stream receiveStream = response.GetResponseStream();
                // Honor the charset advertised by the server when present.
                using (StreamReader readStream = response.CharacterSet == null
                    ? new StreamReader(receiveStream)
                    : new StreamReader(receiveStream, Encoding.GetEncoding(response.CharacterSet)))
                {
                    data = readStream.ReadToEnd();
                }
            }
        }
        return data;
    }

    /// <summary>
    /// Extracts every anchor inside the index page's table-of-contents
    /// (&lt;div class="toc"&gt;) as a LinkData with an absolute URL.
    /// Returns an empty list when the toc div is missing instead of throwing.
    /// </summary>
    public static List<LinkData> StripHTMLLinks(string str)
    {
        List<LinkData> lstHistory = new List<LinkData>();
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(str);

        var toc = doc.DocumentNode.Descendants()
            .FirstOrDefault(n => n.Name == "div" && n.GetAttributeValue("class", "") == "toc");
        if (toc == null)
        {
            // Page layout changed or the download failed: nothing to link.
            return lstHistory;
        }

        foreach (var a in toc.DescendantNodes().Where(n => n.Name == "a"))
        {
            var href = a.GetAttributeValue("href", "");
            if (!string.IsNullOrEmpty(href) && !string.IsNullOrEmpty(a.InnerText))
            {
                // hrefs on the page are relative; prefix the site root.
                lstHistory.Add(new LinkData(a.InnerText, urlAddress + href));
            }
        }
        return lstHistory;
    }

    /// <summary>
    /// Pulls the visible text out of the first &lt;div class="article"&gt; or
    /// &lt;div class="section"&gt;, replacing newlines, commas and apostrophes
    /// with spaces so the text is CSV-safe downstream. Returns an empty
    /// string when no such div exists (original would throw NullReference).
    /// </summary>
    public static string StripHTMLText(string str)
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(str);

        var node = doc.DocumentNode.Descendants()
            .FirstOrDefault(n => n.Name == "div" &&
                (n.GetAttributeValue("class", "") == "article" ||
                 n.GetAttributeValue("class", "") == "section"));
        if (node == null)
        {
            return string.Empty;
        }
        return node.InnerText.Replace('\n', ' ').Replace(',', ' ').Replace('\'', ' ');
    }
}
/// <summary>
/// Simple pair of (document display name, absolute URL) for one address
/// found on the index page.
/// </summary>
public class LinkData
{
    // Link text shown on the index page; used as the document name in SQL.
    public string DocName = string.Empty;
    // Absolute URL of the address page.
    public string URL = string.Empty;

    public LinkData(string docname, string text)
    {
        this.DocName = docname;
        this.URL = text;
    }
}
}
Cleveland State University
Page 5
Fig 1. Here we can see that the SQL Data which we load through the program.
Cleveland State University
Page 6
2. Text Cleaning
In the text-cleaning section we removed common (stop) words programmatically;
below is the list of common words that were removed from the text.
"a","about","above","across","after","afterwards","again","against","all","almost","alone"
,"along","already","also","although","always","am","among","amongst","amount","an","
and","another","any","anyhow","anyone","anything","anyway","anywhere","are","aroun
d","as","at","back","be","became","because","become","becomes","becoming","been","b
efore","beforehand","behind","being","below","beside","besides","between","beyond","b
ill","both","bottom","but","by","call","can","cannot","cant","co","computer","con","coul
d","couldnt","cry","de","describe","detail","do","done","down","due","during","each","e
g","eight","either","eleven","else","elsewhere","empty","enough","etc","even","ever","ev
ery","everyone","everything","everywhere","except","few","fifteen","fify","fill","find","f
ire","first","five","for","former","formerly","forty","found","four","from","front","full","f
urther","get","give","go","had","has","have","he","hence","her","here","hereafter","hereb
y","herein","hereupon","hers","herself","him","himself","his","how","however","hundred
","i","ie","if","in","inc","indeed","interest","into","is","it","its","itself","keep","last","latte
r","latterly","least","less","ltd","made","many","may","me","meanwhile","might","mill",
"mine","more","moreover","most","mostly","move","much","must","my","myself","nam
e","namely","neither","never","nevertheless","next","nine","no","nobody","none","nor","
not","nothing","now","nowhere","of","off","often","on","once","one","only","onto","or",
"other","others","otherwise","our","ours","ourselves","out","over","own","part","per","pe
rhaps","please","put","rather","re","same","see","seem","seemed","seeming","seems","se
rious","several","she","should","show","side","since","sincere","six","sixty","so","some",
"somehow","someone","something","sometime","sometimes","somewhere","still","such"
,"system","take","ten","than","that","the","their","them","themselves","then","thence","th
ere","thereafter","thereby","therefore","therein","thereupon","these","they","thick","thin"
,"third","this","those","though","three","through","throughout","thru","thus","to","togeth
er","too","top","toward","towards","twelve","twenty","two","un","under","until","up","u
pon","us","very","via","was","we","well","were","what","whatever","when","whence","
whenever","where","whereafter","whereas","whereby","wherein","whereupon","whereve
r","whether","which","while","whither","who","whoever","whole","whom","whose","wh
y","will","with","within","without","would","yet","you","your","yours","yourself","yours
elves"
Cleveland State University
Page 7
3. Calculate frequency count, document weight, weight matrix and
cosine matrix:
Tf-idf Weight values :
Here, $w_{t,d} = tf\text{-}weight_{t,d} \times idf_t$, where $tf\text{-}weight_{t,d} = \log_{10}(1 + tf_{t,d})$ and $idf_t = \log_{10}(N / df_t)$.
Cleveland State University
Page 8
Common.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace TextMining
{
    /// <summary>
    /// Shared text-cleaning data: the English stop-word list and the
    /// characters used to split raw document text into tokens.
    /// </summary>
    public static class Common
    {
        // Stop words dropped during text cleaning. Built from a flat word
        // array; the Dictionary<string, bool> shape (every value true) is
        // kept so existing lookups against CommonWords keep working.
        public static Dictionary<string, bool> CommonWords = new[]
        {
            "a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost",
            "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amount",
            "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
            "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming",
            "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between",
            "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant",
            "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do",
            "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
            "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything",
            "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five",
            "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further",
            "get", "give", "go", "had", "has", "have", "he", "hence", "her", "here",
            "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
            "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "interest",
            "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least",
            "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine",
            "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name",
            "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none",
            "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once",
            "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
            "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re",
            "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should",
            "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
            "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten",
            "than", "that", "the", "their", "them", "themselves", "then", "thence", "there",
            "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thick",
            "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru",
            "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two",
            "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we",
            "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter",
            "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while",
            "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
            "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"
        }.ToDictionary(w => w, w => true);

        // Characters on which raw document text is split into tokens.
        public static char[] Splitters = new char[] { ' ', ',', ';', '.', '(', ')' };
    }
}
TexMining.cs
using System;
using System.Collections.Generic;
using System.Data.SqlClient;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace TextMining
{
/// <summary>
/// One keyword together with its document frequency (the number of
/// documents the keyword occurs in).
/// </summary>
class KeyWordDetails
{
    // The keyword text.
    public string Key = string.Empty;
    // Document frequency; incremented as documents are counted.
    public int df = 0;

    public KeyWordDetails(string key, int value)
    {
        this.Key = key;
        this.df = value;
    }
}
/// <summary>
/// One document loaded from tblHistory: its name and full text.
/// </summary>
class DocumentDetail
{
    // Document name (commas are stripped by the loader before construction).
    public string DocName = string.Empty;
    // Document body text.
    public string Text = string.Empty;

    public DocumentDetail(string dn, string txt)
    {
        this.DocName = dn;
        this.Text = txt;
    }
}
class TextMining
{
// Top-KeywordCount keywords selected across the whole corpus (set in Main
// from GetKeyWordList).
static string[] aryKeyWord;
// Output folder for all generated CSV files.
static string DocPath = @"../../Documents/";
// Per-keyword document-frequency counters (df), parallel to aryKeyWord.
static List<KeyWordDetails> lstdft = new List<KeyWordDetails>();
// Total number of documents; used as N in the idf term log10(N / df).
static double N=0;
// document name -> (keyword -> raw occurrence count in that document).
static Dictionary<string, Dictionary<string, int>> keyWordCount = new Dictionary<string,
Dictionary<string, int>>();
// document name -> (keyword -> tf-idf weight).
static Dictionary<string, Dictionary<string, double>> WeightMatrix = new
Dictionary<string, Dictionary<string, double>>();
// document name -> (keyword -> length-normalized weight).
static Dictionary<string, Dictionary<string, double>> NormMatrix = new
Dictionary<string, Dictionary<string, double>>();
// document name -> summed tf-idf score. NOTE(review): name looks like a typo
// for "ScoreVector"; kept as-is because other methods reference it.
static Dictionary<string, double> ScoreVecor = new Dictionary<string, double>();
// Scratch buffer: CSV text accumulated between File.WriteAllText calls;
// also appended to by CalculateCounts/CalculateWeightVal/CalculateNormalizeVal.
static string strTex = string.Empty;
// How many top keywords to keep from the corpus.
static int KeywordCount=10;
/// <summary>
/// Pipeline driver: loads documents from SQL Server, selects the top
/// keywords, then writes Count.csv, DocumentFreq.csv, WeightMatrix.csv,
/// NormMatrix.csv and CosineMatrix.csv into DocPath.
/// </summary>
static void Main(string[] args)
{
    List<DocumentDetail> aryDoc = new List<DocumentDetail>();

    // Load every stored address. Commas are stripped from the name so it is
    // safe as a CSV cell; text is lower-cased once up front.
    using (SqlConnection con = new SqlConnection("Data Source=FRNDZ;Initial Catalog=StateOfTheUnion;Integrated Security=True"))
    {
        try
        {
            con.Open();
            using (SqlCommand sqlStatement = new SqlCommand("Select * from tblHistory;", con))
            using (SqlDataReader reader = sqlStatement.ExecuteReader())
            {
                while (reader.Read())
                {
                    aryDoc.Add(new DocumentDetail(
                        ((string)reader["DocName"]).Replace(',', ' '),
                        ((string)reader["Text"]).ToLower()));
                }
            }
        }
        catch (Exception e)
        {
            // Say what actually went wrong instead of a generic message.
            Console.WriteLine("Error while reading tblHistory: " + e.Message);
        }
    }

    Console.WriteLine("Text Cleaning & Fetch Keyword list : started at " + DateTime.Now);
    aryKeyWord = GetKeyWordList(string.Join(" ", aryDoc.Select(s => s.Text)));
    Console.WriteLine("Text Cleaning & Fetch Keyword list : completed at " + DateTime.Now);

    // CSV header row: one column per keyword.
    strTex = "," + String.Join(",", aryKeyWord) + "\n";
    foreach (var item in aryKeyWord)
    {
        lstdft.Add(new KeyWordDetails(item.Trim(), 0));
    }
    N = aryDoc.Count;

    #region Counting keyword counts for all documents
    Console.WriteLine("Calculate Keyword count : started at " + DateTime.Now);
    for (int i = 0; i < aryDoc.Count; i++)
    {
        // Documents can share a name; suffix "1" until the key is unique.
        // (The original only handled the first collision, so a third
        // duplicate would have thrown on Dictionary.Add.)
        var keyval = aryDoc[i].DocName;
        while (keyWordCount.ContainsKey(keyval))
        {
            keyval += "1";
        }
        strTex += keyval + ",";
        // CalculateCounts appends this document's count row to strTex.
        keyWordCount.Add(keyval, CalculateCounts(aryDoc[i].Text.ToLower()));
        strTex += "\n";
    }
    Console.WriteLine("Calculate Keyword count : completed at " + DateTime.Now);
    #endregion
    File.WriteAllText(DocPath + "Count.csv", strTex);

    // Document-frequency table: keyword, df.
    strTex = "Keywords,df values\n";
    strTex += string.Join("\n", lstdft.Select(s => s.Key + "," + s.df));
    File.WriteAllText(DocPath + "DocumentFreq.csv", strTex);

    Console.WriteLine("Calculate Weight value : started at " + DateTime.Now);
    strTex = "," + String.Join(",", aryKeyWord) + ",,Score Value" + "\n";
    foreach (var doc in keyWordCount)
    {
        strTex += doc.Key + ",";
        // CalculateWeightVal appends the weight row and score to strTex.
        WeightMatrix.Add(doc.Key, CalculateWeightVal(doc.Key, doc.Value));
        strTex += "\n";
    }
    Console.WriteLine("\n\nTop 10 pages using score :");
    Console.WriteLine(string.Join("\n", ScoreVecor.OrderByDescending(s => s.Value)
                                                  .Select(s => s.Key).Take(10)));
    Console.WriteLine("\n\n");
    Console.WriteLine("Calculate Weight value : completed at " + DateTime.Now);
    File.WriteAllText(DocPath + "WeightMatrix.csv", strTex);

    Console.WriteLine("Constructing Cosine Matrix : started at " + DateTime.Now);
    strTex = "," + String.Join(",", aryKeyWord) + "\n";
    foreach (var doc in WeightMatrix)
    {
        strTex += doc.Key + ",";
        // CalculateNormalizeVal appends the normalized row (with newline).
        NormMatrix.Add(doc.Key, CalculateNormalizeVal(doc.Value));
    }
    File.WriteAllText(DocPath + "NormMatrix.csv", strTex);

    #region Calculate Norm Cosine values
    // Cosine similarity is symmetric, so only the upper triangle is computed
    // and mirrored; the diagonal is 1 (a document is identical to itself).
    var aryDocOutput = new double[aryDoc.Count, aryDoc.Count];
    for (int i = 0; i < NormMatrix.Count; i++)
    {
        var doc1 = NormMatrix.ElementAt(i).Value;
        aryDocOutput[i, i] = 1; // 1 = similar document
        for (int j = i + 1; j < NormMatrix.Count; j++)
        {
            var doc2 = NormMatrix.ElementAt(j).Value;
            aryDocOutput[i, j] = aryDocOutput[j, i] = calculateNormCosineVal(doc1, doc2);
        }
    }
    #endregion

    #region Print output
    var lstDocKey = NormMatrix.Select(s => s.Key).ToList();
    strTex = "," + string.Join(",", lstDocKey) + "\n";
    for (int i = 0; i <= aryDocOutput.GetUpperBound(0); i++)
    {
        strTex += lstDocKey[i];
        for (int j = 0; j <= aryDocOutput.GetUpperBound(0); j++)
        {
            // (Removed dead PadLeft code that computed a padded string and
            // then appended the raw value anyway.)
            strTex += "," + aryDocOutput[i, j];
        }
        strTex += "\n";
    }
    #endregion
    File.WriteAllText(DocPath + "CosineMatrix.csv", strTex);
    Console.WriteLine("Constructing Cosine Matrix : completed at " + DateTime.Now);
    Console.WriteLine("Done");
    Console.Read();
}
/// <summary>
/// Tokenizes <paramref name="input"/>, removes stop words and pure numbers,
/// and returns the KeywordCount most frequent distinct words, ordered by
/// descending frequency.
/// </summary>
public static string[] GetKeyWordList(string input)
{
    var words = input.Split(Common.Splitters, StringSplitOptions.RemoveEmptyEntries).ToList();
    var temp = 0;
    // ContainsKey is an O(1) hash lookup; the original scanned the whole
    // stop-word dictionary linearly for every token. int.TryParse drops
    // tokens that are just numbers (years, page numbers, ...).
    words.RemoveAll(s => Common.CommonWords.ContainsKey(s) || int.TryParse(s, out temp));
    // GroupBy already yields distinct keys, so the original's extra
    // Distinct() and double ToArray() were redundant.
    return words.GroupBy(s => s)
                .OrderByDescending(g => g.Count())
                .Select(g => g.Key)
                .Take(KeywordCount)
                .ToArray();
}
/// <summary>
/// Counts each selected keyword's occurrences in one document, bumps the
/// keyword's document frequency when it occurs at least once, and appends
/// the count row to the shared CSV buffer (strTex).
/// </summary>
private static Dictionary<string, int> CalculateCounts(string doc)
{
    Dictionary<string, int> aryCount = new Dictionary<string, int>();
    // Split and normalize the document ONCE; the original re-split the whole
    // document text for every keyword (O(keywords x length)).
    var tokens = doc.Split(' ').Select(t => t.Trim().ToLower()).ToList();
    foreach (var keyWord in aryKeyWord)
    {
        var key = keyWord.Trim();
        int count = tokens.Count(t => t == key);
        if (count > 0)
        {
            // Keyword occurs in this document: increment document frequency.
            // Null-guard: the original dereferenced FirstOrDefault() directly.
            var obj = lstdft.FirstOrDefault(i => i.Key.Trim() == key);
            if (obj != null)
            {
                obj.df++;
            }
        }
        aryCount.Add(key, count);
    }
    // Append this document's count row to the shared CSV buffer.
    strTex += string.Join(",", aryCount.Select(s => s.Value).ToList());
    return aryCount;
}
/// <summary>
/// Computes the tf-idf weight vector for one document:
/// weight = round(log10(1 + tf) * log10(N / df), 4).
/// Records the document's summed score in ScoreVecor and appends the
/// weight row plus score to the shared CSV buffer (strTex).
/// </summary>
public static Dictionary<string, double> CalculateWeightVal(string doc, Dictionary<string,
int> lsttf)
{
    Dictionary<string, double> weightVector = new Dictionary<string, double>();
    var score = 0.0;
    foreach (var tf in lsttf)
    {
        var term = lstdft.FirstOrDefault(p => p.Key.Trim() == tf.Key.Trim());
        // Guards the original's two failure modes: a missing term threw
        // NullReference, and df == 0 produced log10(N/0) = Infinity which
        // then leaked into the CSV. A never-seen term weighs 0.
        double weight = 0.0;
        if (term != null && term.df > 0)
        {
            weight = Math.Round((Math.Log10(1 + tf.Value) * Math.Log10(N / term.df)), 4);
        }
        weightVector.Add(term != null ? term.Key : tf.Key.Trim(), weight);
        score += weight;
    }
    ScoreVecor.Add(doc, score);
    strTex += String.Join(",", weightVector.Select(s => s.Value)) + ",," + score;
    return weightVector;
}
/// <summary>
/// Length-normalizes a document's weight vector (each component divided by
/// the vector's Euclidean norm, rounded to 2 decimals; all zeros stay zero).
/// Appends the normalized row to the shared CSV buffer (strTex).
/// </summary>
public static Dictionary<string, double> CalculateNormalizeVal(Dictionary<string, double>
docVector)
{
    Dictionary<string, double> normVector = new Dictionary<string, double>();
    // Sum of squares via LINQ Sum instead of the original's side-effecting
    // ForEach-with-closure accumulation; s * s avoids Math.Pow for squaring.
    var vectorVal = docVector.Values.Sum(s => s * s);
    // Hoisted: the original recomputed Math.Sqrt once per component.
    var norm = Math.Sqrt(vectorVal);
    foreach (var item in docVector)
    {
        // Zero vector: leave every component 0 rather than dividing by 0.
        var normVal = vectorVal != 0 ? Math.Round(item.Value / norm, 2) : 0;
        normVector.Add(item.Key, normVal);
    }
    strTex += String.Join(",", normVector.Select(s => s.Value)) + "\n";
    return normVector;
}
/// <summary>
/// Cosine similarity of two already-normalized weight vectors: the dot
/// product over the selected keywords, rounded to 2 decimals. A keyword
/// missing from either vector contributes 0 (matching the original's
/// FirstOrDefault-on-struct behavior).
/// </summary>
private static double calculateNormCosineVal(Dictionary<string, double> doc1,
Dictionary<string, double> doc2)
{
    double dotProduct = 0;
    foreach (var key in aryKeyWord)
    {
        // TryGetValue is an O(1) hash lookup; the original scanned each
        // dictionary linearly with Where(...), making the whole cosine
        // matrix accidentally quadratic in the keyword count.
        double d1, d2;
        doc1.TryGetValue(key.Trim(), out d1);
        doc2.TryGetValue(key.Trim(), out d2);
        dotProduct += d1 * d2;
    }
    return Math.Round(dotProduct, 2);
}
}
}
Cleveland State University
Page 16
Output : All Excel output files are also attached with this document.
Console :
DocumentFreq.xls :
Recommended