"""zipf_curve.py

Plot how closely the word frequencies of some texts follow Zipf's law.

Every regular file in the ``texts`` folder is read, stripped of ASCII
punctuation, lower-cased and split into words.  For each text the ``depth``
most frequent words are plotted as a percentage of the most frequent word,
next to the ideal curve ``100 / rank``.  The figure is written to
``wordamounts.png`` and then shown on screen.
"""

# Imports
import os
import string
from collections import Counter

import numpy as np
from matplotlib import pyplot as plt
from scipy.interpolate import make_interp_spline

# define some dictionaries
texts = {}            # text name -> cleaned, lower-cased content
textlengths = {}      # text name -> total number of words in the text
textwordamounts = {}  # text name -> {word: occurrences} for the top words

unwantedCharacters = list(string.punctuation)

# How many ranks we'll show
depth = 10
xAxis = [str(number) for number in range(1, depth + 1)]

# Getting all files in text folder.
# Sorted so the legend order (and colour assignment) is reproducible;
# os.listdir() returns entries in arbitrary order.
filePaths = sorted(os.listdir('texts'))

# Getting text from the files in folder
for path in filePaths:
    fullPath = os.path.join('texts', path)
    # Sub-directories cannot be opened as text; skip them instead of crashing.
    if not os.path.isfile(fullPath):
        continue
    with open(fullPath, 'r', encoding='UTF-8') as f:
        # splitext only strips the final extension, so "a.v1.txt" and
        # "a.v2.txt" keep distinct names instead of overwriting each other.
        texts[os.path.splitext(path)[0]] = f.read()

# Cleaning and counting the text
# One translation table removes every unwanted character in a single pass.
punctuationTable = str.maketrans('', '', ''.join(unwantedCharacters))
for text in texts:
    # Remove unwanted characters from the text and normalise the case
    texts[text] = texts[text].translate(punctuationTable).lower()

    # split() without an argument splits on any run of whitespace, so line
    # breaks separate words and repeated spaces yield no empty "words".
    words = texts[text].split()

    # Saving the text length to show in the label of the line later
    textlengths[text] = len(words)

    # Occurrences of the `depth` most common words, most frequent first.
    # Ties keep the order in which the words first appear in the text.
    textwordamounts[text] = dict(Counter(words).most_common(depth))


# Get the percentage value of a given max value
def percentify(value, max):
    """Return ``value`` as a whole-number percentage of ``max``.

    The parameter name ``max`` shadows the builtin inside this function; it
    is kept so existing keyword callers continue to work.

    Raises:
        ZeroDivisionError: if ``max`` is zero.
    """
    return round(value / max * 100)


# Generate smooth curves
def smoothify(yInput):
    """Return ``(x_smooth, y_smooth)`` for a smooth curve through ``yInput``.

    The values are placed at x = 0, 1, ..., len(yInput) - 1 and interpolated
    with a B-spline evaluated at 600 equally spaced positions.

    The spline degree is 3 where possible and is lowered when fewer than
    four points are available.  With fewer than two points nothing can be
    interpolated, so the input is returned as it is.
    """
    y = np.asarray(yInput, dtype=float)
    # Derive x from the data so inputs shorter than `depth` still line up.
    x = np.arange(len(y))

    if len(y) < 2:
        return x.astype(float), y

    # define x as 600 equally spaced values between the min and max of x
    x_smooth = np.linspace(x.min(), x.max(), 600)

    # A spline of degree k needs at least k + 1 points.
    degree = min(3, len(y) - 1)
    spl = make_interp_spline(x, y, k=degree)
    y_smooth = spl(x_smooth)

    # Return the two x and y axes
    return x_smooth, y_smooth


# Make the perfect curve
ziffianCurveValues = [100 / i for i in range(1, depth + 1)]
x, y = smoothify(ziffianCurveValues)
plt.plot(x, y, label='Ziffian Curve', ls=':', color='grey')

# Plot the texts
for name, amounts in textwordamounts.items():
    # A file without any words has nothing to plot.
    if not amounts:
        continue

    counts = list(amounts.values())
    # The counts are sorted in descending order, so the first is the maximum.
    maxValue = counts[0]
    yAxis = [percentify(count, maxValue) for count in counts]
    x, y = smoothify(yAxis)
    plt.plot(x, y, label=f'{name} [{textlengths[name]}]', lw=1, alpha=0.5)

plt.xticks(range(0, depth), xAxis)
plt.legend()
plt.savefig('wordamounts.png', dpi=300)
plt.show()