DESCRIPTION
RHadoop is an open source project aiming to combine two rising stars in the analytics firmament: R and Hadoop. With more than 2M users, R is arguably the dominant language for expressing complex statistical computations. Hadoop needs no introduction at HUG. With RHadoop we are trying to combine the expressiveness of R with the scalability of Hadoop, and to pave the way for the statistical community to tackle big data with the tools it is familiar with. At this time RHadoop is a collection of three packages that interface with HDFS, HBase and mapreduce, respectively. For mapreduce, the package is called rmr, and we have tried to give it a simple, high-level interface that is true to the mapreduce model and integrated with the rest of the language. We will cover the API and provide some examples.
RHadoop, Hadoop for R
r4stats.com
rhdfs
rhbase
rmr
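The three packages load like ordinary R packages; a minimal sketch, assuming they and their Hadoop dependencies are installed:
library(rhdfs)   # work with HDFS files from R
library(rhbase)  # work with HBase tables from R
library(rmr)     # write mapreduce jobs in R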
sapply(data, function)
mapreduce(data, function)
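The analogy spelled out as a hedged sketch; the 1:10 input and the squaring function are illustrative, not from the talk:
squares = sapply(1:10, function(x) x^2)       # ordinary R: apply a function in memory
ints = to.dfs(1:10)                           # the same data, written to HDFS
out = mapreduce(input = ints,
                map = function(k, v) keyval(v, v^2))  # the same function, as a map step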
#!/usr/bin/Rscript
library(rmr)
mapreduce(…)
Expose MR: Java, C++; rmr, Rhipe, Dumbo, Pydoop, Hadoopy
Hide MR: Hive, Pig; Cascading, Crunch; Cascalog, Scalding, Scrunch
rmr belongs with the tools that expose the mapreduce model rather than hide it behind a higher-level language.
For comparison, k-means written against Pig: a Jython driver script plus a Java UDF.
#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01
MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroids: divide the score range into k equal parts
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i != k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""
    register udf.jar
    DEFINE find_centroid FindCentroid('$centroids');
    raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
    centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
    grouped = group centroided by centroid;
    result = foreach grouped generate group, AVG(centroided.gpa);
    store result into 'output';
""")

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids for this iteration and compute how far they moved
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")   # delete the output directory before the next iteration
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i != k-1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("did not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;

    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i = 0; i < centroidStrings.length; i++)
            centroids[i] = Double.parseDouble(centroidStrings[i]);
    }

    @Override
    public Double exec(Tuple input) throws IOException {
        double min_distance = Double.MAX_VALUE;
        double closest_centroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - (Double)input.get(0));
            if (distance < min_distance) {
                min_distance = distance;
                closest_centroid = centroid;
            }
        }
        return closest_centroid;
    }
}
mapreduce(input, output, map, reduce)
input: one or more HDFS paths, or the output of other mapreduce jobs
output: an HDFS path; defaults to a temporary location
map: a function of two arguments returning a keyval(); defaults to the identity
reduce: a function of two arguments returning a keyval(); defaults to none (no reduce phase)
map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v)
reduce = function(k, vv) keyval(k, length(vv))
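Putting signature and snippets together into one complete call; a hedged sketch in which the input and output paths are invented and hash() stands for any integer hash of the key:
out = mapreduce(
  input = "/user/me/events",          # hypothetical input path
  output = "/user/me/sample-counts",  # hypothetical output path
  map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v),  # keep ~1 key in 10
  reduce = function(k, vv) keyval(k, length(vv)))             # count records per surviving key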
condition = function(x) x > 10
out = mapreduce(input = input,
                map = function(k, v) if (condition(v)) keyval(k, v))
x = from.dfs(hdfs.object)
hdfs.object = to.dfs(x)
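The round trip in one hedged sketch; the 1:1000 input and the even/odd grouping are illustrative:
small.ints = to.dfs(1:1000)   # push an in-memory vector to a temporary HDFS file
result = from.dfs(            # pull the job's output back into R as keyval pairs
  mapreduce(input = small.ints,
            map = function(k, v) keyval(v %% 2, v),
            reduce = function(k, vv) keyval(k, sum(unlist(vv)))))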
INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count(DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;
mapreduce(
  input =
    mapreduce(input = "pv_users",   # inner job: one gender per distinct userid
              map = function(k, v) keyval(v['userid'], v['gender']),
              reduce = function(k, vv) keyval(k, vv[[1]])),
  output = "pv_gender_sum",
  map = function(k, v) keyval(v, 1),   # outer job: count distinct users per gender
  reduce = function(k, vv) keyval(k, sum(unlist(vv))))
kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a, b) norm(as.matrix(a - b), type = 'F')) {
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    # each iteration reassigns points to the nearest center and recomputes the centers
    for (i in 1:iterations) {
      newCenters = lapply(values(newCenters), unlist)
      newCenters = kmeans.iter(points, distfun, centers = newCenters)}
    newCenters}

kmeans.iter =
  function(points, distfun, ncenters = length(centers), centers = NULL) {
    from.dfs(
      mapreduce(input = points,
                map = if (is.null(centers)) {
                        # first pass: assign each point to a random center
                        function(k, v) keyval(sample(1:ncenters, 1), v)}
                      else {
                        # later passes: assign each point to the nearest center
                        function(k, v) {
                          distances = lapply(centers, function(c) distfun(c, v))
                          keyval(centers[[which.min(distances)]], v)}},
                # new center = componentwise mean of the points assigned to it
                reduce = function(k, vv)
                  keyval(NULL, apply(do.call(rbind, vv), 2, mean))))}
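A hedged usage sketch for the functions above; the 100 random 2-D points and the choice of 5 centers are invented test values:
points = to.dfs(lapply(1:100, function(i) rnorm(2)))  # 100 random 2-D points, one record each
centers = kmeans(points, ncenters = 5, iterations = 10)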
input.specs, output.specs
combine
reduce.on.data.frame
tuning.params
verbose

local, hadoop backends
profiling
managed IO
optimize
mapreduce(mapreduce(…
mapreduce(input = c(input1, input2), …)
equijoin(left.input = input1, right.input = input2, …)
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)
abstract.job = function(input, output, …) {
  …
  result = mapreduce(input = input, output = output)
  …
  result}
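For instance, a reusable wrapper along those lines; the name filter.count, its parameters, and the predicate are invented for illustration:
filter.count = function(input, output = NULL, pred) {
  # keep records whose value satisfies pred, then count survivors per key
  mapreduce(input = input,
            output = output,
            map = function(k, v) if (pred(v)) keyval(k, v),
            reduce = function(k, vv) keyval(k, length(vv)))}
counts = filter.count("/user/me/events", pred = function(v) v > 0)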
repo: github.com/RevolutionAnalytics/RHadoop/
license: Apache 2.0
documentation: R help, github wiki
Q/A: github issue tracking
project lead: David Champagne