how to find word vertically in a crossword - python-3.4

I'm trying to write a function that accepts a 2-dimensional (2D) list of characters (like a crossword puzzle) and a string as input arguments, the function must then search the columns of the 2D list to find a match of the word. If a match is found, the function should then return a list containing the row index and column index of the start of the match, otherwise it should return the value None.
For example if the function is called as shown below:
crosswords = [['s','d','o','g'],['c','u','c','m'],['a','c','a','t'],['t','e','t','k']]
word = 'cat'
find_word_vertical(crosswords,word)
then the function should return:
[1,0]

def find_word_vertical(crosswords,word):
columns = []
finished = []
for col in range(len(crosswords[0])):
columns.append( [crosswords[row][col] for row in
range(len(crosswords))])
for a in range(0, len(crosswords)):
column = [crosswords[x][a] for x in range(len(crosswords))]
finished.append(column)
for row in finished:
r=finished.index(row)
whole_row = ''.join(row)
found_at = whole_row.find(word)
if found_at >=0:
return([found_at, r])

This one is for finding horizontal... could switching this around help?
def find_word_horizontal(crosswords, word):
list1=[]
row_index = -1
column_index = -1
refind=''
for row in crosswords:
index=''
for column in row:
index= index+column
list1.append(index)
for find_word in list1:
if word in find_word:
row_index = list1.index(find_word)
refind = find_word
column_index = find_word.index(word)
ret = [row_index,column_index]
if row_index!= -1 and column_index != -1:
return ret

The simple version is:
def find_word_vertical(crosswords,word):
z=[list(i) for i in zip(*crosswords)]
for rows in z:
row_index = z.index(rows)
single_row = ''.join(rows)
column_index = single_row.find(word)
if column_index >= 0:
return([column_index, row_index])
This gives correct output [1,0]

To find a word vertically:
def find_word_vertical(crosswords,word):
if not crosswords or not word:
return None
for col_index in range(len(crosswords[0])):
str = ''
for row_index in range(len(crosswords)):
str = str + crosswords[row_index][col_index]
if temp_str.find(word) >= 0:
return [str.find(word),col_index]

To find a word Horizontaly:
def find_word_horizontal(crosswords, word):
if not crosswords or not word:
return None
for index, row in enumerate(crosswords):
str = ''.join(row)
if str.find(word) >= 0:
return [index,str.find(word)]

#find vertical word in 2d
def find_it(li,wo):
out_list=[]
for row in range(len(li)):
print(row)
chek_word=""
for item in range(len(li)):
chek_word=chek_word + li[item][row]
print(chek_word)
if wo in chek_word:
print(chek_word.find(wo))
out_list=[ chek_word.find(wo) , row]
print(out_list)
break
this is mine and yes it work

Related

Creating dictionary from a '.fasta' file containing several genes from an organism

I have a '.txt' file in which a list of genes are given and their sequence. I need to create a dictionary in which the keys are the names of the genes and the values are the sequences.
I want the output of the dictionary to be this:
dict = ('sequence1' : 'AATTGGCC', 'sequence2' : 'AAGGCCTT', ...)
So this is what I tried, but I ran into some problems:
dictionary = {}
accesion_number = ""
sequentie = ""
with open("6EP.fasta", "r") as proteoom:
for line in proteoom:
if line.startswith(">"):
line.strip()
dictionary[accesion_number] = sequentie
sequentie = ""
else:
sequentie = sequentie + line.rstrip().strip("\n").strip("\r")
dictionary[accesion_number] = sequentie
Does anyone know what went wrong here, and how I can fix it?
Thanks in advance!
I can think of two ways to do this:
High memory usage
If the file is not too large, you can use readlines() and then use the indexes like so:
IDs = []
sequences = []
with open('Proteome.fasta', 'r') as f:
raw_data = f.readlines()
for i, l in enumerate(raw_data):
if l[0] == '>':
IDs.append(l)
sequences.append(raw_data[i + 1])
Low memory usage
Now, if you don't want to load the contents of the file into memory, then I think you can read the file twice by saving the indexes of every ID line plus one, like so:
Get the '>' lines and their indexes, which will be the ID index plus one
Compare if the line number is in the indexes list and, if so, then append the content to your variable
In here, I'm taking advantage of the fact that the lists are, by definition, sorted.
IDs = []
indexes = []
sequences = []
with open('Proteome.fasta', 'r') as f:
for i, l in enumerate(f):
IDs.append(l) # Get your IDs
indexes.append(i + 1) # Get the index of the ID + 1
with open('Proteome.fasta', 'r') as f:
for i, l in enumerate(f):
if i == indexes[0]: # Check whether line matches with the index
sequences.append(l) # Get your sequence
indexes.pop(0) # Remove the first element of the indexes
I hope this helps! ;)
Code
ids = []
seq = []
char = ['_', ':', '*', '#'] #invalid in sequence
seqs = ''
with open('fasta.txt', 'r') as f: #open sample fasta
for line in f:
if line.startswith('>'):
ids.append(line.strip('\n'))
if seqs != '': #if there's previous seq
seq.append(seqs) #append the seq
seqs = '' #then start a new seq
elif line not in char:
seqs = seqs + line.strip('\n') #build seq with each line until '>'
seq.append(seqs) #append any remaining seq
print(ids)
print(seq)
Result
['>SeqABCD [organism=Mus musculus]', '>SeqABCDE [organism=Plasmodium]']
['ACGTCAGTCACGTACGTCAGTTCAGTC...', 'GGTACTGCAAAGTTCTTCCGCCTGATTA...']
Sample File
>SeqABCD [organism=Mus musculus]
ACGTCAGTCACGTACGTCAGTTCAGTCARYSTYSATCASMBMBDH
ATCGTTTTTATGTAATTGCTTATTGTTGTGTGTAGATTTTTTAA
AAATATCATTTGAGGTCAATACAAATCCTATTTCTATCGTTTTT
CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAAT
>SeqABCDE [organism=Plasmodium falciparum]
GGTACTGCAAAGTTCTTCCGCCTGATTAATTATCCATTTTACCTT
TTGTTTTGCTTCTTTGAAGTAGTTTCTCTTTGCAAAATTCCTCTT
GGTACTGCAAAGTTCTTCCGCCTGATTAATTATCCGGTACTGCAA
AGTCAATTTTATATAATTTAATCAAATAAATAAGTTTATGGTTAA

Code for sorting letters of a word is not working?

For solving this question in Hackerrank,I wrote the following piece of code.It worked well on my machine, but when it was submitted,it is evaluated as a wrong answer.
T = int(input().strip())
arr = []
result = []
for i in range(T):
s = input().strip()
arr.append(s)
for item in arr:
odd = [];even = []
for value in item:
n = item.index(value)
if n%2 ==0:
even.append(value)
if n%2 == 1:
odd.append(value)
p = ''.join(even) ; q = ''.join(odd)
result.append(p + " " + q)
odd.clear();even.clear();
for value in result:
print(value)
Try this test case:
2
aaB
Baa
Your code is giving wrong results. Inner for loop is looping the characters. What if two characters are same? Your code will give wrong index number.

Create a Python recursive function that prints all the integers less than or equal to the value provided in the original call

i=0
def recursiveIntegers(n):
if n==1:
return 1;
else:
reqval= n-1;
print("less than or equal to the original -->",reqval);
return recursiveIntegers(reqval)
userValue = int(input("Enter value "))
recursiveIntegers(userValue)
What am i missing for this to print out equal values..?
Print 'n' first ... then do that 'reqval = n-1'

Longest substring in alphabetical order [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 8 years ago.
Improve this question
Write a program that prints the longest substring of s in which the letters occur in alphabetical order. For example, if s = 'azcbobobegghakl', then your program should print
Longest substring in alphabetical order is: beggh
In the case of ties, print the first substring. For example, if s = 'abcbcd', then your program should print
Longest substring in alphabetical order is: abc
Here you go edx student i've been helped to finish the code :
from itertools import count
def long_sub(input_string):
maxsubstr = input_string[0:0] # empty slice (to accept subclasses of str)
for start in range(len(input_string)): # O(n)
for end in count(start + len(maxsubstr) + 1): # O(m)
substr = input_string[start:end] # O(m)
if len(substr) != (end - start): # found duplicates or EOS
break
if sorted(substr) == list(substr):
maxsubstr = substr
return maxsubstr
sub = (long_sub(s))
print "Longest substring in alphabetical order is: %s" %sub
These are all assuming you have a string (s) and are needing to find the longest substring in alphabetical order.
Option A
test = s[0] # seed with first letter in string s
best = '' # empty var for keeping track of longest sequence
for n in range(1, len(s)): # have s[0] so compare to s[1]
if len(test) > len(best):
best = test
if s[n] >= s[n-1]:
test = test + s[n] # add s[1] to s[0] if greater or equal
else: # if not, do one of these options
test = s[n]
print "Longest substring in alphabetical order is:", best
Option B
maxSub, currentSub, previousChar = '', '', ''
for char in s:
if char >= previousChar:
currentSub = currentSub + char
if len(currentSub) > len(maxSub):
maxSub = currentSub
else: currentSub = char
previousChar = char
print maxSub
Option C
matches = []
current = [s[0]]
for index, character in enumerate(s[1:]):
if character >= s[index]: current.append(character)
else:
matches.append(current)
current = [character]
print "".join(max(matches, key=len))
Option D
def longest_ascending(s):
matches = []
current = [s[0]]
for index, character in enumerate(s[1:]):
if character >= s[index]:
current.append(character)
else:
matches.append(current)
current = [character]
matches.append(current)
return "".join(max(matches, key=len))
print(longest_ascending(s))
The following code solves the problem using the reduce method:
solution = ''
def check(substr, char):
global solution
last_char = substr[-1]
substr = (substr + char) if char >= last_char else char
if len(substr) > len(solution):
solution = substr
return substr
def get_largest(s):
global solution
solution = ''
reduce(check, list(s))
return solution

alignment of sequences

I want to do pairwise alignment with uniprot and pdb sequences. I have an input file containing uniprot and pdb IDs like this.
pdb id uniprot id
1dbh Q07889
1e43 P00692
1f1s Q53591
first, I need to read each line in an input file
2) retrieve the pdb and uniprot sequences from pdb.fasta and uniprot.fasta files
3) Do alignment and calculate sequence identity.
Usually, I use the following program for pairwise alignment and seq.identity calculation.
library("seqinr")
seq1 <- "MDEKRRAQHNEVERRRRDKINNWIVQLSKIIPDSSMESTKSGQSKGGILSKASDYIQELRQSNHR"
seq2<- "MKGQQKTAETEEGTVQIQEGAVATGEDPTSVAIASIQSAATFPDPNVKYVFRTENGGQVM"
library(Biostrings)
globalAlign<- pairwiseAlignment(seq1, seq2)
pid(globalAlign, type = "PID3")
I need to print the output like this
pdbid uniprotid seq.identity
1dbh Q07889 99
1e43 P00692 80
1f1s Q53591 56
How can I change the above code ? your help would be appreciated!
'
This code is hopefully what your looking for:
class test():
def get_seq(self, pdb,fasta_file): # Get sequences
from Bio.PDB.PDBParser import PDBParser
from Bio import SeqIO
aa = {'ARG':'R','HIS':'H','LYS':'K','ASP':'D','GLU':'E','SER':'S','THR':'T','ASN':'N','GLN':'Q','CYS':'C','SEC':'U','GLY':'G','PRO':'P','ALA':'A','ILE':'I','LEU':'L','MET':'M','PHE':'F','TRP':'W','TYR':'Y','VAL':'V'}
p=PDBParser(PERMISSIVE=1)
structure_id="%s" % pdb[:-4]
structure=p.get_structure(structure_id, pdb)
residues = structure.get_residues()
seq_pdb = ''
for res in residues:
res = res.get_resname()
if res in aa:
seq_pdb = seq_pdb+aa[res]
handle = open(fasta_file, "rU")
for record in SeqIO.parse(handle, "fasta") :
seq_fasta = record.seq
handle.close()
self.seq_aln(seq_pdb,seq_fasta)
def seq_aln(self,seq1,seq2): # Align the sequences
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist
matrix = matlist.blosum62
gap_open = -10
gap_extend = -0.5
alns = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend)
top_aln = alns[0]
aln_seq1, aln_seq2, score, begin, end = top_aln
with open('aln.fasta', 'w') as outfile:
outfile.write('> PDB_seq\n'+str(aln_seq1)+'\n> Uniprot_seq\n'+str(aln_seq2))
print aln_seq1+'\n'+aln_seq2
self.seq_id('aln.fasta')
def seq_id(self,aln_fasta): # Get sequence ID
import string
from Bio import AlignIO
input_handle = open("aln.fasta", "rU")
alignment = AlignIO.read(input_handle, "fasta")
j=0 # counts positions in first sequence
i=0 # counts identity hits
for record in alignment:
#print record
for amino_acid in record.seq:
if amino_acid == '-':
pass
else:
if amino_acid == alignment[0].seq[j]:
i += 1
j += 1
j = 0
seq = str(record.seq)
gap_strip = seq.replace('-', '')
percent = 100*i/len(gap_strip)
print record.id+' '+str(percent)
i=0
a = test()
a.get_seq('1DBH.pdb','Q07889.fasta')
This outputs:
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------EQTYYDLVKAF-AEIRQYIRELNLIIKVFREPFVSNSKLFSANDVENIFSRIVDIHELSVKLLGHIEDTVE-TDEGSPHPLVGSCFEDLAEELAFDPYESYARDILRPGFHDRFLSQLSKPGAALYLQSIGEGFKEAVQYVLPRLLLAPVYHCLHYFELLKQLEEKSEDQEDKECLKQAITALLNVQSG-EKICSKSLAKRRLSESA-------------AIKK-NEIQKNIDGWEGKDIGQCCNEFI-EGTLTRVGAKHERHIFLFDGL-ICCKSNHGQPRLPGASNAEYRLKEKFF-RKVQINDKDDTNEYKHAFEIILKDENSVIFSAKSAEEKNNW-AALISLQYRSTL---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
MQAQQLPYEFFSEENAPKWRGLLVPALKKVQGQVHPTLESNDDALQYVEELILQLLNMLCQAQPRSASDVEERVQKSFPHPIDKWAIADAQSAIEKRKRRNPLSLPVEKIHPLLKEVLGYKIDHQVSVYIVAVLEYISADILKLVGNYVRNIRHYEITKQDIKVAMCADKVLMDMFHQDVEDINILSLTDEEPSTSGEQTYYDLVKAFMAEIRQYIRELNLIIKVFREPFVSNSKLFSANDVENIFSRIVDIHELSVKLLGHIEDTVEMTDEGSPHPLVGSCFEDLAEELAFDPYESYARDILRPGFHDRFLSQLSKPGAALYLQSIGEGFKEAVQYVLPRLLLAPVYHCLHYFELLKQLEEKSEDQEDKECLKQAITALLNVQSGMEKICSKSLAKRRLSESACRFYSQQMKGKQLAIKKMNEIQKNIDGWEGKDIGQCCNEFIMEGTLTRVGAKHERHIFLFDGLMICCKSNHGQPRLPGASNAEYRLKEKFFMRKVQINDKDDTNEYKHAFEIILKDENSVIFSAKSAEEKNNWMAALISLQYRSTLERMLDVTMLQEEKEEQMRLPSADVYRFAEPDSEENIIFEENMQPKAGIPIIKAGTVIKLIERLTYHMYADPNFVRTFLTTYRSFCKPQELLSLIIERFEIPEPEPTEADRIAIENGDQPLSAELKRFRKEYIQPVQLRVLNVCRHWVEHHFYDFERDAYLLQRMEEFIGTVRGKAMKKWVESITKIIQRKKIARDNGPGHNITFQSSPPTVEWHISRPGHIETFDLLTLHPIEIARQLTLLESDLYRAVQPSELVGSVWTKEDKEINSPNLLKMIRHTTNLTLWFEKCIVETENLEERVAVVSRIIEILQVFQELNNFNGVLEVVSAMNSSPVYRLDHTFEQIPSRQKKILEEAHELSEDHYKKYLAKLRSINPPCVPFFGIYLTNILKTEEGNPEVLKRHGKELINFSKRRKVAEITGEIQQYQNQPYCLRVESDIKRFFENLNPMGNSMEKEFTDYLFNKSLEIEPRNPKPLPRFPKKYSYPLKSPGVRPSNPRPGTMRHPTPLQQEPRKISYSRIPESETESTASAPNSPRTPLTPPPASGASSTTDVCSVFDSDHSSPFHSSNDTVFIQVTLPHGPRSASVSSISLTKGTDEVPVPPPVPPRRRPESAPAESSPSKIMSKHLDSPPAIPPRQPTSKAYSPRYSISDRTSISDPPESPPLLPPREPVRTPDVFSSSPLHLQPPPLGKKSDHGNAFFPNSPSPFTPPPPQTPSPHGTRRHLPSPPLTQEVDLHSIAGPPVPPRQSTSQHIPKLPPKTYKREHTHPSMHRDGPPLLENAHSS
PDB_seq 100 # pdb to itself would obviously have 100% identity
Uniprot_seq 24 # pdb sequence has 24% identity to the uniprot sequence
For this to work on you input file, you need to put my a.get_seq() in a for loop with the inputs from your text file.
EDIT:
Replace the seq_id function with this one:
def seq_id(self,aln_fasta):
import string
from Bio import AlignIO
from Bio import SeqIO
record_iterator = SeqIO.parse(aln_fasta, "fasta")
first_record = record_iterator.next()
print '%s has a length of %d' % (first_record.id, len(str(first_record.seq).replace('-','')))
second_record = record_iterator.next()
print '%s has a length of %d' % (second_record.id, len(str(second_record.seq).replace('-','')))
lengths = [len(str(first_record.seq).replace('-','')), len(str(second_record.seq).replace('-',''))]
if lengths.index(min(lengths)) == 0: # If both sequences have the same length the PDB sequence will be taken as the shortest
print 'PDB sequence has the shortest length'
else:
print 'Uniport sequence has the shortes length'
idenities = 0
for i,v in enumerate(first_record.seq):
if v == '-':
pass
#print i,v, second_record.seq[i]
if v == second_record.seq[i]:
idenities +=1
#print i,v, second_record.seq[i], idenities
print 'Sequence Idenity = %.2f percent' % (100.0*(idenities/min(lengths)))
to pass the arguments to the class use:
with open('input_file.txt', 'r') as infile:
next(infile)
next(infile) # Going by your input file
for line in infile:
line = line.split()
a.get_seq(segs[0]+'.pdb',segs[1]+'.fasta')
It might be something like this; a repeatable example (e.g., with short files posted on-line) would help...
library(Biostrings)
pdb = readAAStringSet("pdb.fasta")
uniprot = readAAStringSet("uniprot.fasta")
to input all sequences into two objects. pairwiseAlignment accepts a vector as first (query) argument, so if you were wanting to align all pdb against all uniprot pre-allocate a result matrix
pids = matrix(numeric(), length(uniprot), length(pdb),
dimnames=list(names(uniprot), names(pdb)))
and then do the calculations
for (i in seq_along(uniprot)) {
globalAlignment = pairwiseAlignment(pdb, uniprot[i])
pids[i,] = pid(globalAlignment)
}

Resources