1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
"""Randomisation test for gene overlap across three random gene draws.

Repeatedly draws three random gene samples (100, 200 and 300 genes) from the
gene pool read from ``gene_all.bed`` and tallies, for every query gene listed
in ``querygene_with_count.csv``, how often it lands in none, exactly two or
all three of the draws.  The tallies are converted to empirical p-values and
written to ``randomisation_test_distribution_1000000_pval.csv``.
"""
import random

import numpy
import pandas

# Number of random draws of the three gene samples.
N_ITERATIONS = 1000000
# Sizes of the three random samples taken in every iteration.
SAMPLE_SIZES = (100, 200, 300)


def read_column(reader, path, column):
    """Return one column of a header-less table as a plain Python list.

    reader -- pandas reader to use (``pandas.read_table`` or ``pandas.read_csv``).
    path   -- file to read.
    column -- zero-based index of the column to keep.
    """
    frame = reader(path, usecols=[column], header=None)
    # stack() flattens the single-column frame into one Series of values.
    return numpy.array(frame.stack()).tolist()


def build_result_table(genes, observed_counts):
    """Create the result table with one row per query gene.

    Columns, in order: gene, querygenecount, count0, count1, count2, count3
    and pval; the tally and p-value columns all start at 0.
    """
    table = pandas.DataFrame({'gene': genes, 'querygenecount': observed_counts})
    for column in ('count0', 'count1', 'count2', 'count3', 'pval'):
        table[column] = 0
    return table


def classify_draw(first, second, third):
    """Classify the genes of three draws by how many draws contain them.

    Each argument is the set of genes of one draw.  Returns the tuple
    ``(in_any, in_exactly_two, in_all_three)`` of gene sets.
    """
    in_all_three = first & second & third
    in_two_or_more = (first & second) | (first & third) | (second & third)
    # Genes shared by all three draws are tallied separately (count3), so
    # they must not be counted as "exactly two" as well; otherwise count1
    # and the count2 + count3 p-value would both be wrong.
    return first | second | third, in_two_or_more - in_all_three, in_all_three


def run_randomisation(table, gene_pool, n_iterations=N_ITERATIONS,
                      sample_sizes=SAMPLE_SIZES):
    """Run the random draws and fill count0..count3 of *table* in place.

    table        -- result table as produced by ``build_result_table``.
    gene_pool    -- list of genes the random samples are drawn from.
    n_iterations -- number of times the three samples are drawn.
    sample_sizes -- sizes of the three samples drawn per iteration.

    Returns *table* for convenience.
    """
    genes = table['gene']
    # Built once: the pool does not change between iterations.
    in_pool = genes.isin(set(gene_pool)).to_numpy()

    row_count = len(table)
    none_hits = numpy.zeros(row_count, dtype=numpy.int64)
    two_hits = numpy.zeros(row_count, dtype=numpy.int64)
    three_hits = numpy.zeros(row_count, dtype=numpy.int64)

    size_a, size_b, size_c = sample_sizes
    for _ in range(n_iterations):
        draw_a = set(random.sample(gene_pool, size_a))
        draw_b = set(random.sample(gene_pool, size_b))
        draw_c = set(random.sample(gene_pool, size_c))
        in_any, in_exactly_two, in_all_three = classify_draw(
            draw_a, draw_b, draw_c)

        # Only genes that are part of the pool can be "in none of the draws";
        # query genes missing from the pool are never tallied in count0.
        none_hits += in_pool & ~genes.isin(in_any).to_numpy()
        two_hits += genes.isin(in_exactly_two).to_numpy()
        three_hits += genes.isin(in_all_three).to_numpy()

    table['count0'] = table['count0'] + none_hits
    table['count2'] = table['count2'] + two_hits
    table['count3'] = table['count3'] + three_hits
    # Whatever is not "none", "exactly two" or "all three" is attributed to
    # "exactly one".
    table['count1'] = (n_iterations - table['count0'] - table['count2']
                       - table['count3'])
    return table


def assign_pvalues(table, n_iterations=N_ITERATIONS):
    """Fill the pval column row by row and return the table rounded to 5 dp.

    A gene observed in 2 sets gets the fraction of iterations in which it was
    in two or three draws; a gene observed in 3 sets gets the fraction in
    which it was in all three.  Rows with any other observed count keep their
    existing pval value (the initial 0).
    """
    observed = table['querygenecount']
    two_or_more = (table['count2'] / n_iterations) + (table['count3'] / n_iterations)
    all_three = table['count3'] / n_iterations
    # Select per row; assigning the whole column inside a loop over the rows
    # would leave every gene with the formula of the last matching row.
    table['pval'] = numpy.where(
        observed == 2,
        two_or_more,
        numpy.where(observed == 3, all_three, table['pval']),
    )
    return table.round({'pval': 5})


def main():
    """Load the inputs, run the randomisation test and write the CSV."""
    totalgene_list = read_column(pandas.read_table, r'gene_all.bed', 3)
    querygene_list = read_column(
        pandas.read_csv, r'querygene_with_count.csv', 0)
    querygenecount_list = read_column(
        pandas.read_csv, r'querygene_with_count.csv', 1)

    df = build_result_table(querygene_list, querygenecount_list)
    print(df)

    run_randomisation(df, totalgene_list)
    df = assign_pvalues(df)
    df.to_csv('randomisation_test_distribution_1000000_pval.csv')


if __name__ == '__main__':
    main()
|