1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
"""
Python package for random data generation.
"""
21
22
23 from pyspark.rdd import RDD
24 from pyspark.mllib._common import _deserialize_double, _deserialize_double_vector
25 from pyspark.serializers import NoOpSerializer
    """
    Generator methods for creating RDDs comprised of i.i.d. samples from
    some distribution.
    """
33
34 @staticmethod
59
60 @staticmethod
61 - def normalRDD(sc, size, numPartitions=None, seed=None):
62 """
63 Generates an RDD comprised of i.i.d. samples from the standard normal
64 distribution.
65
66 To transform the distribution in the generated RDD from standard normal
67 to some other normal N(mean, sigma^2), use
68 C{RandomRDDs.normal(sc, n, p, seed)\
69 .map(lambda v: mean + sigma * v)}
70
71 >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1L)
72 >>> stats = x.stats()
73 >>> stats.count()
74 1000L
75 >>> abs(stats.mean() - 0.0) < 0.1
76 True
77 >>> abs(stats.stdev() - 1.0) < 0.1
78 True
79 """
80 jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed)
81 normal = RDD(jrdd, sc, NoOpSerializer())
82 return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))
83
84 @staticmethod
85 - def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
86 """
87 Generates an RDD comprised of i.i.d. samples from the Poisson
88 distribution with the input mean.
89
90 >>> mean = 100.0
91 >>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=1L)
92 >>> stats = x.stats()
93 >>> stats.count()
94 1000L
95 >>> abs(stats.mean() - mean) < 0.5
96 True
97 >>> from math import sqrt
98 >>> abs(stats.stdev() - sqrt(mean)) < 0.5
99 True
100 """
101 jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed)
102 poisson = RDD(jrdd, sc, NoOpSerializer())
103 return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))
104
105 @staticmethod
124
125 @staticmethod
127 """
128 Generates an RDD comprised of vectors containing i.i.d. samples drawn
129 from the standard normal distribution.
130
131 >>> import numpy as np
132 >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
133 >>> mat.shape
134 (100, 100)
135 >>> abs(mat.mean() - 0.0) < 0.1
136 True
137 >>> abs(mat.std() - 1.0) < 0.1
138 True
139 """
140 jrdd = sc._jvm.PythonMLLibAPI() \
141 .normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
142 normal = RDD(jrdd, sc, NoOpSerializer())
143 return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
144
145 @staticmethod
146 - def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
147 """
148 Generates an RDD comprised of vectors containing i.i.d. samples drawn
149 from the Poisson distribution with the input mean.
150
151 >>> import numpy as np
152 >>> mean = 100.0
153 >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
154 >>> mat = np.mat(rdd.collect())
155 >>> mat.shape
156 (100, 100)
157 >>> abs(mat.mean() - mean) < 0.5
158 True
159 >>> from math import sqrt
160 >>> abs(mat.std() - sqrt(mean)) < 0.5
161 True
162 """
163 jrdd = sc._jvm.PythonMLLibAPI() \
164 .poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
165 poisson = RDD(jrdd, sc, NoOpSerializer())
166 return poisson.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
167
def _test():
    # NOTE(review): the `def _test():` line was missing from this view and has
    # been reconstructed; the body below is exactly what was visible.
    """Run this module's doctests against a local SparkContext."""
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The doctests need a `sc` in scope; use two local worker threads so the
    # generated RDDs have more than one partition.
    globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    # Always stop the context before deciding the exit status so the JVM is
    # shut down even on failure.
    globs['sc'].stop()
    if failure_count:
        exit(-1)
180
181
if __name__ == "__main__":
    # Running this module directly executes its doctest suite.
    _test()
184