HDInsight Programming

Ports

• HDFS: http://localhost:50070/

• Oozie: http://localhost:11000/oozie/v1/admin/status

• Templeton: http://localhost:50111/templeton/v1/status

• ODBC: use port 10000 in the DSN configuration or connection string (see the sketch below).
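
A minimal sketch of querying Hive over this ODBC port from .NET, assuming a Hive ODBC driver is installed and a DSN named HiveDSN (hypothetical) has been configured against localhost:10000:

using System;
using System.Data.Odbc;

class HiveOdbcSample
{
    static void Main()
    {
        // "HiveDSN" is a hypothetical DSN pointing at the Hive server on port 10000.
        using (var conn = new OdbcConnection("DSN=HiveDSN"))
        {
            conn.Open();
            var cmd = new OdbcCommand("select * from src limit 10", conn);
            using (var reader = cmd.ExecuteReader())
            {
                while (reader.Read())
                {
                    Console.WriteLine(reader[0]);
                }
            }
        }
    }
}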

HDFS WebClient

NuGet package: Microsoft .NET API for Hadoop WebClient

WebHDFS

List Directory

var client = new WebHDFSClient(new Uri("http://localhost:50070"), "hadoop");
client.GetDirectoryStatus("/")
    .ContinueWith(dl => dl.Result.Directories.ToList()
        .ForEach(d => Console.WriteLine("/" + d.PathSuffix)));

Create Directory

var client = new WebHDFSClient(new Uri("http://localhost:50070"), "hadoop");
var created = await client.CreateDirectory("/TEST");
Console.WriteLine("True or False, we created the directory " + created.ToString());
var deleted = await client.DeleteDirectory("/TEST");
Console.WriteLine("True or False, we deleted the directory " + deleted.ToString());

Task Chaining

client.CreateDirectory("/TEST")
    .ContinueWith(x => client.CreateFile(@"c:\tmp\Titles.txt", "/user/hadoop/titles.txt")
        .ContinueWith(t => Console.WriteLine("new file located at " + t.Result))
        .ContinueWith(t => client.OpenFile("/user/hadoop/titles.txt")
            .ContinueWith(
                resp => resp.Result.Content.ReadAsStringAsync()
                    .ContinueWith(bigString => Console.WriteLine("new file is " + bigString.Result.Length + " bytes long"))
                    .ContinueWith(
                        t2 => client.DeleteDirectory("/user/hadoop/titles.txt")
                            .ContinueWith(b => Console.WriteLine("Successfully deleted file."))
                    )
            )
        )
    );

WebHCat

• Management of HCatalog metadata.

• Hive job submission.

• Pig job submission.

• Map/Reduce job submission.

• Streaming Map/Reduce job submission.

CreateHive

string outputDir = "basichivejob";
var client = new WebHCatHttpClient(new Uri("http://localhost:50111"), "administrator", "", "hadoop");
var t1 = client.CreateHiveJob(@"select * from src;", null, null, outputDir, null);
t1.Wait();
var response = t1.Result;
var output = response.Content.ReadAsAsync<JObject>();
output.Wait();
response.EnsureSuccessStatusCode();
string id = output.Result.GetValue("id").ToString();
client.WaitForJobToCompleteAsync(id).Wait();

Requires using System.Net.Http; (ReadAsAsync<JObject> is an extension method from the System.Net.Http.Formatting assembly, and JObject comes from Json.NET).

Oozie

http://hadoopsdk.codeplex.com/wikipage?title=Oozie%20Client&referringTitle=Home
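
The SDK's Oozie client (see the link above) wraps the Oozie REST API; the admin status endpoint from the port list can also be called directly. A minimal sketch using HttpClient against a local Oozie server, assuming no authentication:

using System;
using System.Net.Http;

class OozieStatusCheck
{
    static void Main()
    {
        using (var http = new HttpClient())
        {
            // Admin status endpoint from the port list; typically returns JSON such as {"systemMode":"NORMAL"}.
            var json = http.GetStringAsync("http://localhost:11000/oozie/v1/admin/status").Result;
            Console.WriteLine(json);
        }
    }
}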

.NET MapReduce

MRRunner

Mapper

public class SqrtMapper : MapperBase
{
    public override void Map(string inputLine, MapperContext context)
    {
        int inputValue = int.Parse(inputLine);

        // Perform the work.
        double sqrt = Math.Sqrt((double)inputValue);

        // Write output data.
        context.EmitKeyValue(inputValue.ToString(), sqrt.ToString());
    }
}
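
The job class below is also parameterized on a combiner and a reducer. As a sketch, a pass-through reducer for this square-root pipeline could look like the following, assuming the SDK's ReducerCombinerBase base class (the class name and logic are illustrative):

using System.Collections.Generic;
using System.Linq;
using Microsoft.Hadoop.MapReduce;

public class SqrtReducer : ReducerCombinerBase
{
    public override void Reduce(string key, IEnumerable<string> values, ReducerCombinerContext context)
    {
        // Each key is an input integer and each value its square root; emit the first value unchanged.
        context.EmitKeyValue(key, values.First());
    }
}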

Hadoop Job

public class FirstJob : HadoopJob<Mapper, Combiner, Reducer>
{
    public override HadoopJobConfiguration Configure(ExecutorContext context)
    {
        HadoopJobConfiguration config = new HadoopJobConfiguration();
        config.InputPath = "input/SqrtJob";
        config.OutputFolder = "output/SqrtJob";
        return config;
    }
}

var hadoop = Hadoop.Connect();
hadoop.MapReduceJob.ExecuteJob<JobType>(arguments);

MRRunner -dll MyMRProgram.dll {-class jobClass} {-- job-class options}
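
Putting the pieces together, a minimal driver that runs FirstJob programmatically might look like the sketch below; it assumes the mapper and job classes above are compiled into the same assembly, which is also what MRRunner loads via -dll:

using Microsoft.Hadoop.MapReduce;

public class Program
{
    public static void Main(string[] args)
    {
        // Connect to the cluster and execute the job configured by FirstJob.
        var hadoop = Hadoop.Connect();
        hadoop.MapReduceJob.ExecuteJob<FirstJob>(args);
    }
}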

Linq to Hive

HiveRow

public class TitlesRow : HiveRow
{
    public string MovieId { get; set; }
    public string Name { get; set; }
    public int Year { get; set; }
    public string Rating { get; set; }
}

public class AwardsRow : HiveRow
{
    public string MovieId { get; set; }
    public string AwardId { get; set; }
    public int Year { get; set; }
    public string Won { get; set; }
    public string Type { get; set; }
    public string Category { get; set; }
}

public class ActorsRow : HiveRow
{
    public string MovieId { get; set; }
    public string ActorId { get; set; }
    public int AwardsCount { get; set; }
    public string Name { get; set; }
}

HiveConnection

public class MyHiveDatabase : HiveConnection
{
    public MyHiveDatabase(Uri webHcatUri, string username, string password, string azureStorageAccount, string azureStorageKey)
        : base(webHcatUri, username, password, azureStorageAccount, azureStorageKey) { }

    public HiveTable<AwardsRow> Awards
    {
        get { return this.GetTable<AwardsRow>("Awards"); }
    }

    public HiveTable<TitlesRow> Titles
    {
        get { return this.GetTable<TitlesRow>("Titles"); }
    }

    public HiveTable<ActorsRow> Actors
    {
        get { return this.GetTable<ActorsRow>("Actors"); }
    }
}

Simple Linq

var db = new MyHiveDatabase(
    webHcatUri: new Uri("http://localhost:50111"),
    username: "hadoop",
    password: null,
    azureStorageAccount: "ASV storage account name",
    azureStorageKey: "ASV storage account key");

var q = from x in
            (from a in db.Actors
             select new { a.ActorId, foo = a.AwardsCount })
        group x by x.ActorId into g
        select new { ActorId = g.Key, bar = g.Average(z => z.foo) };

q.ExecuteQuery().Wait();
var results1 = q.ToList();

var projectionQuery = from aw in db.Awards
                      join t in db.Titles on aw.MovieId equals t.MovieId
                      where t.Year == 1994 && aw.Won == "True"
                      select new { MovieId = t.MovieId, Name = t.Name, Type = aw.Type, Category = aw.Category, Year = t.Year };

var newTable = projectionQuery.CreateTable("AwardsIn1994");

Excel ODBC

http://www.microsoft.com/en-us/download/details.aspx?id=40886

Resources

• http://hadoopsdk.codeplex.com/

• https://github.com/WindowsAzure-Samples/HDInsight-Labs-Preview

• http://wag.codeplex.com/

Mahout

Machine learning is programming computers to optimize a performance criterion using example data or past experience.

Classification

Clustering

Recommenders

Collaborative Filtering - User Based

Collaborative Filtering - Item Based

Data

http://labrosa.ee.columbia.edu/millionsong/tasteprofile

http://www.grouplens.org/node/12

Mahout Command

c:\apps\dist\mahout-0.7\bin> hadoop jar c:\Apps\dist\mahout-0.7\mahout-core-0.7-job.jar org.apache.mahout.cf.taste.hadoop.item.RecommenderJob -s SIMILARITY_COOCCURRENCE --input=input/mInput.txt --output=output --usersFile=input/users.txt
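
For reference, RecommenderJob reads a comma-separated preference file (userID,itemID,preference) and --usersFile lists the user IDs to produce recommendations for. Hypothetical contents for the two input files named in the command above:

input/mInput.txt (userID,itemID,preference):
1,101,5.0
1,102,3.0
2,101,2.0
2,103,4.5

input/users.txt (one user ID per line):
1
2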