# -*- coding: utf-8 -*-
"""
Created on Mon Mar  2 08:44:28 2020

@author: Johan Rene van Dorp

Simulates the normalized Hamming distance between two people ("John" and
"Mary") identified by n Bernoulli(p) coin tosses, compares the empirical
pmf/cdf of that distance against the theoretical Binomial distribution, and
saves two comparison figures (Figure1.png, Figure2.png).
"""
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import binom
from scipy.stats import t
import pandas as pd
import Dist_Library as dl


def Hamming_distance(x, y):
    """Return the normalized Hamming distance between two equal-length
    binary vectors x and y (the fraction of positions where they differ)."""
    z = sum(abs(x - y))
    n = len(x)
    return (z / n)


def John_and_Mary_Difference(m, n, p):
    """Sample the distance between John and Mary m times.

    Each replication draws two independent Bernoulli(p) vectors of length n
    (the n coin tosses identifying John and Mary) and records their
    normalized Hamming distance.  Returns a numpy array of length m.
    """
    sample = np.zeros(m)
    for i in np.arange(m):
        # Bernoulli random sample of size n with success probability p for
        # both John and Mary (binomial with size-1 trials is Bernoulli)
        John = np.random.binomial(1, p, n)
        Mary = np.random.binomial(1, p, n)
        # Evaluating the Hamming distance between John and Mary
        sample[i] = Hamming_distance(John, Mary)
    return sample


# We are supposing that John and Mary are identified by the_n Bernoulli
# trials, each with success probability the_prob.
the_n = 10
bern_sample_size = 1
the_prob = 0.5

# Sampling from a Bernoulli random variable is the same as sampling from a
# binomial random variable with sample size 1.  Here we generate 10 coin
# tosses for John and for Mary.
John = np.random.binomial(bern_sample_size, the_prob, the_n)
Mary = np.random.binomial(bern_sample_size, the_prob, the_n)
H_dist = Hamming_distance(John, Mary)
print('John:', John)
print('Mary:', Mary)
print('H_dist:', H_dist)

# Create a random sample of size m of distances between John and Mary to
# estimate empirically the pmf of the Hamming distance between them.
the_m = 100
JM_Sample = John_and_Mary_Difference(the_m, the_n, the_prob)

# Evaluating the empirical and theoretical probabilities.  The distance
# H_n(J, M) takes the values k/n, k = 0..n, where k (the number of differing
# coordinates) is Binomial(n, 2p(1-p)): two independent Bernoulli(p) tosses
# differ with probability p(1-p) + (1-p)p.  For p = 0.5 this equals 0.5, so
# this generalizes (and reproduces) the original hard-coded use of the_prob.
the_outcomes = np.arange(0, the_n + 1) / the_n
the_emp_probs = dl.Estimate_empirical_pmf(the_outcomes, JM_Sample)
diff_prob = 2 * the_prob * (1 - the_prob)
the_theo_probs = binom.pmf(np.arange(0, the_n + 1), the_n, diff_prob)
JM_pmf_data = pd.DataFrame({'Outcomes': the_outcomes,
                            'Theoretical PMF': the_theo_probs,
                            'Empirical PMF': the_emp_probs})
print(JM_pmf_data)

# Evaluating the bounds and widths for the bars in the histograms
the_bounds, the_widths = dl.Calc_boundaries_and_widths(the_outcomes)

# ---------------------------------------------------------------------------
# Figure 1: theoretical and empirical probability mass functions
# ---------------------------------------------------------------------------
plt.rc('font', family='serif', size='12')
JM_figure = plt.figure(1)
JM_figure.set_figwidth(9)
JM_figure.set_figheight(4.5)
JM_figure.subplots_adjust(left=0.1, bottom=0.15, right=0.95, top=0.9,
                          hspace=0.1, wspace=0.3)

# Adding the left panel with the theoretical pmf
Left_Panel = JM_figure.add_subplot(1, 2, 1)
# Evaluating the x-axis and y-axis limits (shared by both panels)
x_lims = (the_bounds[0], the_bounds[len(the_bounds) - 1])
max_height = 1.1 * max(max(the_theo_probs), max(the_emp_probs))
y_lims = (0, max_height + max_height / 10)
plt.xlim(x_lims)
plt.ylim(y_lims)
plt.xlabel('$H_{10}(J,M)$ Distance')
plt.ylabel('Theoretical pmf')
dl.add_pmf_prob_hist_to_figure_panel(Left_Panel, the_outcomes,
                                     the_theo_probs, 'indianred', 0.5)

# Marking the theoretical probability of the fourth outcome.  A theoretical
# value carries no sampling error, hence the zero-width error bar.
bar_p = the_theo_probs[3]
Half_Width = 0
plt.errorbar(the_outcomes[3], bar_p, yerr=Half_Width, fmt='o',
             color='darkgreen', capsize=5, label='Theo. Value')
text_value = bar_p
text_str = f'{text_value:3.2f}'
plt.text(the_outcomes[3] - 0.075, text_value, text_str, color='red', size=12,
         verticalalignment='center', horizontalalignment='right')
plt.legend(loc='best')

# Adding the right panel with the empirical pmf
Right_Panel = JM_figure.add_subplot(1, 2, 2)
plt.xlim(x_lims)
plt.ylim(y_lims)
plt.xlabel('$H_{10}(J,M)$ Distance')
plt.ylabel('Empirical pmf')
dl.add_pmf_prob_hist_to_figure_panel(Right_Panel, the_outcomes,
                                     the_emp_probs, 'skyblue', 0.5)
text_str = 'm = ' + str(the_m)
plt.text(0.1, max_height, text_str, color='red')

# Adding a confidence-interval error bar for the fourth outcome, using the
# standard error of a sample proportion and a Student-t quantile.
bar_p = the_emp_probs[3]
alpha = 0.05
SE = ((bar_p * (1 - bar_p)) / the_m) ** 0.5
t_q = t.ppf(1 - alpha / 2, the_m - 1)
Half_Width = t_q * SE
plt.errorbar(the_outcomes[3], bar_p, yerr=Half_Width, fmt='o',
             color='darkgreen', capsize=5, label='95% Conf. Int.')
# Annotate the upper bound, lower bound, and the point estimate itself
for text_value in (bar_p + Half_Width, bar_p - Half_Width, bar_p):
    text_str = f'{text_value:3.2f}'
    plt.text(the_outcomes[3] - 0.075, text_value, text_str, color='red',
             size=12, verticalalignment='center',
             horizontalalignment='right')
plt.legend(loc='best')

JM_figure.suptitle('Distance between John and Mary', size='16')
plt.savefig('Figure1.png', dpi=300)

# ---------------------------------------------------------------------------
# Figure 2: theoretical and empirical cumulative distribution functions
# ---------------------------------------------------------------------------
plt.rc('font', family='serif', size='12')
JM_figure = plt.figure(2)
JM_figure.set_figwidth(9)
JM_figure.set_figheight(4.5)
JM_figure.subplots_adjust(left=0.1, bottom=0.15, right=0.95, top=0.9,
                          hspace=0.1, wspace=0.3)

# Adding the left panel with the theoretical cdf
Left_Panel = JM_figure.add_subplot(1, 2, 1)
# Evaluating the x-axis and y-axis limits (a cdf lives in [0, 1])
x_lims = (the_bounds[0], the_bounds[len(the_bounds) - 1])
y_lims = (0, 1.1)
plt.xlim(x_lims)
plt.ylim(y_lims)
plt.xlabel('$H_{10}(J,M)$ Distance')
plt.ylabel('Theoretical cdf')
dl.plot_discrete_cdf(Left_Panel, the_outcomes, the_theo_probs,
                     'indianred', 2)

# Marking the theoretical cdf value on the step between the fifth and sixth
# outcomes.  BUG FIX: the cdf at the fifth outcome is the cumulative sum of
# the first five pmf values (np.sum(the_theo_probs[:5])); the original
# np.sum(the_theo_probs[4]) summed a single scalar, i.e. plotted the pmf
# value on a cdf panel.
bar_p = np.sum(the_theo_probs[:5])
Half_Width = 0  # theoretical value: no sampling error
plt.errorbar((the_outcomes[4] + the_outcomes[5]) / 2, bar_p,
             yerr=Half_Width, fmt='o', color='darkgreen', capsize=5,
             label='Theo. Value')
text_value = bar_p
text_str = f'{text_value:3.2f}'
plt.text(the_outcomes[4] - 0.025, text_value, text_str, color='red', size=12,
         verticalalignment='center', horizontalalignment='right')
plt.legend(loc='best')

# Adding the right panel with the empirical cdf
Right_Panel = JM_figure.add_subplot(1, 2, 2)
plt.xlim(x_lims)
plt.ylim(y_lims)
plt.xlabel('$H_{10}(J,M)$ Distance')
plt.ylabel('Empirical cdf')
dl.plot_discrete_cdf(Right_Panel, the_outcomes, the_emp_probs, 'skyblue', 2)
text_str = 'm = ' + str(the_m)
plt.text(0.1, 0.9, text_str, color='red')

# Adding a confidence-interval error bar for the empirical cdf at the fifth
# outcome.  BUG FIX: use the cumulative empirical probability so the error
# bar sits on the plotted cdf step (the original used the single pmf value
# the_emp_probs[4]).
bar_p = np.sum(the_emp_probs[:5])
alpha = 0.05
SE = ((bar_p * (1 - bar_p)) / the_m) ** 0.5
t_q = t.ppf(1 - alpha / 2, the_m - 1)
Half_Width = t_q * SE
plt.errorbar((the_outcomes[4] + the_outcomes[5]) / 2, bar_p,
             yerr=Half_Width, fmt='o', color='darkgreen', capsize=5,
             label='95% Conf. Int.')
# Annotate the upper bound, lower bound, and the point estimate itself
for text_value in (bar_p + Half_Width, bar_p - Half_Width, bar_p):
    text_str = f'{text_value:3.2f}'
    plt.text(the_outcomes[4] - 0.025, text_value, text_str, color='red',
             size=12, verticalalignment='center',
             horizontalalignment='right')
plt.legend(loc='best')

JM_figure.suptitle('Distance between John and Mary', size='16')
plt.savefig('Figure2.png', dpi=300)