def map(key: str, values: str) -> list[tuple[str, int]]:
    """Produce a ``(word, 1)`` pair for every word of one document.

    Args:
        key: Document name; passed to ``get_doc`` to load the content.
        values: Not used by this implementation; kept so the mapper keeps its
            ``(key, values)`` signature.

    Returns:
        A list of ``(word, 1)`` tuples, i.e. data shaped like
        ``[("to", 1), ("yours", 1)]`` -- one entry per item of the content.
    """
    # NOTE(review): assumes get_doc returns an iterable of words; iterating a
    # plain string would yield single characters instead -- confirm.
    content = get_doc(key)
    return [(word, 1) for word in content]


def reduce(key: str, values: list) -> list[tuple[str, int]]:
    """Add up all counts collected for one word.

    Args:
        key: A word, e.g. ``"t1"``.
        values: The counts recorded for that word, e.g. ``[1, 2, 3]``. Each
            item is converted with ``int()``, so numeric strings are accepted.

    Returns:
        A one-element list ``[(key, total)]``, the same shape of data that
        ``map`` returns, e.g. ``[("to", 6)]``.

    Raises:
        ValueError: If an item of ``values`` cannot be converted to ``int``.
    """
    # The pseudo-code "Emit(result)" is expressed as a return value so the
    # function honours its documented and annotated return type.
    total = sum(int(v) for v in values)
    return [(key, total)]
# sorted返回一个排序好的list,因为list中的元素是一个个的tuple,key设定按照tuple中第几个元素排序 # groupby把迭代器中相邻的重复元素挑出来放在一起,key设定按照tuple中第几个元素为关键字来挑选重复元素 # 下面的循环中groupby返回的key是intermediate_key,而group是个list,是1个或多个 # 有着相同intermediate_key的(intermediate_key, intermediate_value) groups = {} for key, group in itertools.groupby(sorted(intermediate, key=lambda im: im[0]), key=lambda x: x[0]): groups[key] = [y for x, y in group] # groups是一个字典,其key为上面说到的intermediate_key,value为所有对应intermediate_key的intermediate_value # 组成的一个列表 # print(groups) return [reducer(intermediate_key, groups[intermediate_key]) for intermediate_key in groups]
class test:
    """Word-frequency demo that feeds sample sentences to ``MapReduce.map_reduce``."""

    def get_most_common_from_text(self, text, n=100):
        """Return the ``n`` most frequent tokens of ``text``.

        The text is tokenized with ``jieba.cut``; tokens shorter than two
        characters are dropped before counting.

        Args:
            text: The string to tokenize.
            n: Maximum number of ``(token, count)`` pairs to return.

        Returns:
            The list of ``(token, count)`` tuples produced by
            ``Counter.most_common(n)``.
        """
        word_list = [x for x in jieba.cut(text) if len(x) >= 2]
        return Counter(word_list).most_common(n)

    def map(self, k, v):
        """Mapper: turn one document into ``(word, count)`` pairs.

        ``run`` passes ``self.map`` as the mapper, so the class has to provide
        it; without this method ``run`` fails with ``AttributeError``.

        Args:
            k: Document key (unused here).
            v: Document text.

        Returns:
            The result of ``get_most_common_from_text(v)``.
        """
        # NOTE(review): assumes the mapper is invoked as mapper(key, value);
        # the calling code is not visible from this class -- confirm.
        return self.get_most_common_from_text(v)

    def reducer(self, k, v):
        """Reducer: total the counts of one word.

        Args:
            k: The word.
            v: An iterable with the counts recorded for that word.

        Returns:
            The tuple ``(k, sum(v))``.
        """
        return k, sum(v)

    def run(self):
        """Run the map/reduce pipeline over five sample sentences and print the result."""
        documents = {
            "a": "The quick brown fox jumped over the lazy grey dogs.",
            "b": "That's one small step for a man, one giant leap for mankind.",
            "c": " Mary had a little lamb,Its fleece was white as snow;And everywhere that Mary went,The lamb was sure to go",
            "d": "I pledge to honor and defend you and yours above all others",
            "e": "To share in blessings and burdens, to be your advocate, your champion",
        }
        result = MapReduce.map_reduce(documents, self.map, self.reducer)
        print(result)