The present work is based on the paper "Seven clusters in genomic triplet distributions" by Alexander N. Gorban and Andrei Y. Zinovyev; a link to the paper is given below.
DNA sequences are composed of 4 nucleotide bases: A, T, G, and C.
One distinctive message in a genomic sequence is a piece of text, called a gene.
It was one of many great discoveries of the twentieth century that biological information is encoded in genes by means of triplets of letters, called codons in the biological literature.
_Download the DNA sequences from the NCBI site (https://www.ncbi.nlm.nih.gov/)._
import numpy as np
import itertools
import preprocessing
from sklearn import preprocessing
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Directory containing the downloaded FASTA genome files
# (relative to the notebook's working directory; "Gnomes" spelling matches the dataset folder).
path = "../input/gnomes/Gnomes"
def loadGenome(path):
    """Read a single-record FASTA file and return its sequence.

    Parameters
    ----------
    path : str
        Path to the FASTA file. The first line is assumed to be the
        ``>header`` line and is skipped.

    Returns
    -------
    tuple
        ``(genome, genome_size)`` — the concatenated sequence string and
        its length.
    """
    genome = ""
    with open(path, "r") as f:
        _ = f.readline()  # skip the FASTA header line
        for line in f:
            # rstrip("\n") instead of slicing off the last character:
            # the final line of a file often has no trailing newline, and
            # line[:-1] would silently drop its last base.
            genome += line.rstrip("\n")
    genome_size = len(genome)
    return genome, genome_size
# Load the genome and derive the alphabet of symbols that actually occur.
genome, genome_size = loadGenome(path+"/Covid.fasta")
genome_size
# sorted(...) instead of list(set(...)): set iteration order varies between
# interpreter runs (hash randomization), which would otherwise shuffle the
# feature-column ordering built from `letters` downstream.
letters = sorted(set(genome))
letters
# Length (in bases) of each genome fragment treated as one "text" sample.
block_size = 350
# Word lengths (k-mer sizes) whose frequency profiles will be compared.
word_size = [1,2,3,4]
# Number of complete, non-overlapping blocks that fit in the genome.
num_blocks = genome_size//block_size
num_blocks
# Split the genome into consecutive, non-overlapping fixed-size blocks.
# The original loop always appended the leftover slice after the last full
# block, producing an empty-string block whenever genome_size is an exact
# multiple of block_size; here the remainder is kept only if non-empty.
blocks = [genome[i * block_size:(i + 1) * block_size] for i in range(num_blocks)]
if num_blocks * block_size < genome_size:
    blocks.append(genome[num_blocks * block_size:])
def getFeatures(genome, num_blocks, word_size, block_size=350, letters=None):
    """Count word (k-mer) occurrences in each genome block.

    For every word length ``ws`` in *word_size*, each block is cut into
    non-overlapping words of ``ws`` letters and the occurrences of every
    possible word are counted.

    Parameters
    ----------
    genome : str
        Full genome string.
    num_blocks : int
        Number of complete blocks to process.
    word_size : iterable of int
        Word lengths to build frequency tables for.
    block_size : int, optional
        Length of each block. Defaults to 350 (the notebook's setting);
        previously this was read from a module-level global.
    letters : sequence of str, optional
        Alphabet used to enumerate possible words. Defaults to the sorted
        set of characters present in *genome*.

    Returns
    -------
    dict
        Maps each word size ``ws`` to a ``(num_blocks, len(letters)**ws)``
        count matrix.
    """
    if letters is None:
        letters = sorted(set(genome))
    # len(letters)**ws columns, not a hard-coded 4**ws: if the sequence
    # contains ambiguity codes such as 'N', the lookup table has more than
    # 4**ws entries and the original allocation raised an IndexError.
    features = {ws: np.zeros((num_blocks, len(letters) ** ws)) for ws in word_size}
    for ws in word_size:
        # Map every possible ws-letter word to its column index.
        lookUp = {w: i for i, w in
                  enumerate(''.join(p) for p in itertools.product(letters, repeat=ws))}
        for b in range(num_blocks):
            block = genome[b * block_size:(b + 1) * block_size]
            # Non-overlapping words; a trailing fragment shorter than ws is ignored.
            for i in range(block_size // ws):
                word = block[i * ws:(i + 1) * ws]
                features[ws][b, lookUp[word]] += 1
    return features
# Build the raw k-mer count matrices for every configured word size.
features = getFeatures(genome, num_blocks, word_size)
def standardize(features, word_size):
    """Z-score each word-size feature matrix in place and return the dict.

    Every column (word count) is scaled to zero mean and unit variance so
    that high-frequency words do not dominate the later projections.
    """
    for ws in word_size:
        scaler = preprocessing.StandardScaler()
        features[ws] = scaler.fit_transform(features[ws])
    return features
# Replace raw counts with z-scored features before dimensionality reduction.
features = standardize(features, word_size)
def runPCA(features, word_size, n_components=2):
    """Project each word-size feature matrix onto its leading principal components.

    Returns a dict mapping word size -> (num_blocks, n_components) array.
    """
    projected = {}
    for ws in word_size:
        model = PCA(n_components=n_components)
        projected[ws] = model.fit_transform(features[ws])
    return projected
# 2-D PCA projection of every word-size feature set.
featuresPCA = runPCA(features, word_size)
from matplotlib.pyplot import figure
# One scatter panel per word size for the PCA projections.
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
for ax, ws in zip(axes.flat, (1, 2, 3, 4)):
    ax.scatter(featuresPCA[ws].T[0], featuresPCA[ws].T[1], s=5)
# Cluster the 3-mer projection and colour its points by cluster label.
kmeans = KMeans(n_clusters=3, random_state=0).fit(featuresPCA[3])
kmeans.labels_
plt.figure(figsize=(10, 10))
plt.scatter(featuresPCA[3].T[0], featuresPCA[3].T[1], s=5, c=kmeans.labels_)
import sklearn.manifold
# Single t-SNE embedder reused for every word size; the fixed
# random_state keeps the embeddings repeatable between runs.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
def runTSNE(features, word_size):
    """Embed each word-size feature matrix into 2-D with the shared t-SNE model.

    Returns a dict mapping word size -> (num_blocks, 2) embedding array.
    """
    return {ws: tsne.fit_transform(features[ws]) for ws in word_size}
# t-SNE embedding of every word-size feature set (much slower than PCA).
featuresTSNE = runTSNE(features, word_size)
from matplotlib.pyplot import figure
# One scatter panel per word size for the t-SNE embeddings.
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
for ax, ws in zip(axes.flat, (1, 2, 3, 4)):
    ax.scatter(featuresTSNE[ws].T[0], featuresTSNE[ws].T[1], s=5)
# Cluster the 3-mer embedding and colour the points by cluster label.
kmeans_tsne = KMeans(n_clusters=3, random_state=0).fit(featuresTSNE[3])
plt.figure(figsize=(10, 10))
plt.scatter(featuresTSNE[3].T[0], featuresTSNE[3].T[1], s=2, c=kmeans_tsne.labels_)