KMR
wordcountpy.py
1 #!/usr/bin/env python3
2 # -*-coding: utf-8;-*-
3 
4 ## Word Count (2015-06-13)
5 
6 ## This ranks the words by their occurrence count in the "../LICENSE"
7 ## file. It can be run under MPI as follows:
8 ## $ mpiexec -n 4 python3 wordcountpy.py
9 
10 from mpi4py import MPI
11 import kmr4py
12 import re
13 
## Path of the text file whose words are counted; it is resolved
## relative to the working directory the script is started from.
file_name = "../LICENSE"

## KMR context shared by the callbacks and the driver code below.
## NOTE(review): "world" presumably selects MPI_COMM_WORLD -- confirm
## against the kmr4py documentation.
kmr = kmr4py.KMR("world")
17 
def read_words_from_a_file(kv, kvi, kvo, i, *_data):
    """Emit a (word, 1) pair into kvo for every word found in file_name.

    Each line of the file is stripped and split on runs of non-word
    characters (the regex ``\\W+``); every non-empty piece is added to
    ``kvo`` with the value 1.

    Parameters:
        kv, kvi, i, *_data: part of the map-callback signature; unused here.
        kvo: output key-value stream; only its ``add(key, value)`` method
            is called.

    Raises:
        OSError: if ``file_name`` cannot be opened.
    """
    ## The context manager closes the file even when reading or
    ## kvo.add() raises, which the manual open()/close() pair did not.
    with open(file_name, "r") as file_:
        for line in file_:
            for w in re.split(r"\W+", line.strip()):
                ## Splitting yields "" for blank lines and around
                ## leading/trailing punctuation; those are not words.
                if w != "":
                    kvo.add(w, 1)
26 
def print_top_five(kv, kvi, kvo, i, *_data):
    """Print one "#<value>=<count>" line for the first five entries on rank 0.

    Parameters:
        kv: a (key, value) pair. The key is printed as ``int(0 - key)``,
            so it is presumably the negated count produced by
            sum_counts_for_a_word, with the word as the value -- verify
            against the driver pipeline.
        i: index of the pair as passed by the caller; only pairs with
            ``i < 5`` are printed.
        kvi, kvo, *_data: part of the map-callback signature; unused here.
    """
    ## (Original note: no field value in KMR.MR because it is a dummy.)
    (negated_count, word) = kv
    ## Everything except the first five pairs on rank 0 is silently skipped.
    if kmr.rank != 0 or i >= 5:
        return
    print("#%s=%d" % (word, int(0 - negated_count)))
32 
def sum_counts_for_a_word(kvvec, kvi, kvo, *_data):
    """Add the values of kvvec together and emit (first key, -sum) to kvo.

    Parameters:
        kvvec: non-empty sequence of (key, value) pairs; the key of the
            first pair is used for the emitted pair.
        kvo: output key-value stream; only its ``add(key, value)`` method
            is called.
        kvi, *_data: part of the reduce-callback signature; unused here.
    """
    (word, _) = kvvec[0]
    total = sum(value for (_, value) in kvvec)
    ## The sum is emitted negated; print_top_five undoes this with
    ## int(0 - k) when it prints the count.
    kvo.add(word, -total)
39 
## Announce the run once, from rank 0 only.
if kmr.rank == 0:
    print("Ranking words...")

## Pipeline: emit (word, 1) pairs, shuffle them, reduce each word's pairs
## to (word, -count), reverse, sort, and print the leading entries.
## NOTE(review): if the False argument to map_once means "run on every
## rank", each rank reads the whole file and the counts scale with the
## number of ranks -- confirm against the kmr4py documentation.
word_pairs = kmr.emptykvs.map_once(False, read_words_from_a_file,
                                   key="cstring")
shuffled_pairs = word_pairs.shuffle()
negated_counts = shuffled_pairs.reduce(sum_counts_for_a_word,
                                       key="cstring", value="integer")
## reverse() presumably swaps key and value so that sort() orders by the
## negated count (largest count first) -- verify.
count_keyed = negated_counts.reverse()
ranked = count_keyed.sort()
ranked.map(print_top_five, output=False, nothreading=True)

## Tear down the KMR context, then the kmr4py module itself.
kmr.dismiss()
kmr4py.fin()
def fin()
Definition: kmr4py.py:1357