[PYTHON] Graduate School of Information Science and Technology, The University of Tokyo, Department of Creative Informatics, Summer 2015 Programming Exam

This is an example of the answer to the 2015 summer hospital exam.

The content of the description is personally solved by the author and does not guarantee the correct answer, and is not related to the University of Tokyo or the organization involved in providing the content of this examination.

Question theme

--String search

Problem statement

If the University of Tokyo points out, the problem statement will be deleted.

(1)

def solve1(file_path):
    cnt = 0
    with open(file_path, 'r') as f:
        txt = f.read()
        if txt[-1] == '\n' and len(txt) > 1:
                txt = txt[:-1]
        for ch in txt:
            if ch == ';':
                cnt += 1
    return cnt

(2)

def solve2(file_path):
    with open(file_path, 'r') as f:
        txts = f.readlines()
    ret = []
    for index, txt in enumerate(txts):
        if 'main' in txt:
            print('{0}Line: {1}'.format(index+1, txt))

(3)

def solve3(file_path):
    with open(file_path, 'r') as f:
        text = f.readlines()
        txts = []
        for index, txt in enumerate(text):
            if txt[-1] == '\n':
                txts.append(txt[:-1])
            else:
                txts.append(txt)
        s = set()
        for index, txt in enumerate(txts):
            if index < len(txts) - 1:
                next_txt = txts[index + 1]
                if txt == next_txt:
                    s.add(txt)                
    for txt in s:
        print(txt)

(4)

class Line(object):
    def __init__(self, txt, initial_appear, appear_times):
        self.txt = txt
        self.initial_appear = initial_appear
        self.appear_times = appear_times
    
    def __lt__(self, line):
        self.initial_appear < line.initial_appear
        
    def __repr__(self):
        return 'First appearance line: {0},Number of appearances: {1}, txt: {2}'.format(self.initial_appear, self.appear_times, self.txt)

def solve4(file_path):
    with open(file_path, 'r') as f:        
        text = f.readlines()
        txts = []
        for index, txt in enumerate(text):
            if txt[-1] == '\n':
                txts.append(txt[:-1])
            else:
                txts.append(txt)
                
        dic = {} # 'txt': [First appearance line,Number of appearances]
        for index, txt in enumerate(txts):
            if txt in dic.keys():
                dic[txt][1] += 1
            else:
                dic[txt] = [index+1, 1]
    ret = []
    for key in dic.keys():
        obj = dic[key]
        initial_appear = obj[0]
        appear_times = obj[1]
        if appear_times > 1:
            ret.append(Line(key, initial_appear, appear_times))
    
    sorted(ret)
    for line in ret:
        print(line)

(5)

def formatnptxt(nparray):
    txt = ''
    for ch in nparray:
        txt += ch
    return txt    

def solve5(file_path, minimum_len):
     with open(file_path, 'r') as f:        
        text = f.readlines()
        txts = []
        max_len = 0
        for index, txt in enumerate(text):
            if txt[-1] == '\n':
                if len(txt[:-1]) >= minimum_len:
                    txts.append(txt[:-1])
                    max_len = max(max_len, len(txt[:-1]))
            else:
                if len(txt) >= minimum_len:
                    txts.append(txt)
                    max_len = max(max_len, len(txt))                    

        formatted_txts = np.array([[' ' for _ in range(max_len)] for _ in range(len(txts))])
        for i, txt in enumerate(txts):
            for j, ch in enumerate(txt):
                formatted_txts[i, j] = ch
        ret = set()
        sames = 0
        for i in range(0, len(formatted_txts) - 1):
            txt1 = formatted_txts[i]
            for j in range(i+1, len(formatted_txts)):
                txt2 = formatted_txts[j]
                booleans = (txt1 == txt2)
                same_scores = booleans.sum()
                if same_scores < max_len and (max_len - same_scores) < 5:
                    sames += 1
                    pair1 = '{0}\n{1}'.format(formatnptxt(txt1), formatnptxt(txt2))
                    pair2 = '{0}\n{1}'.format(formatnptxt(txt2), formatnptxt(txt1))                    
                    if (not (pair1 in ret)) and (not (pair2 in ret)):
                        ret.add(pair1)
                    
        for set_ele in ret:
            set_ele_array = set_ele.split('\n')
            print('{0}, {1}'.format(set_ele_array[0], set_ele_array[1]))
        print(sames)

(6)

from LCS import LCS

def solve6(file_path, minimu_len):
    with open(file_path, 'r') as f:
        text = f.readlines()
        txts = []
        for index, txt in enumerate(text):
            if txt[-1] == '\n':
                if len(txt[:-1]) >= minimu_len:
                    txts.append(txt[:-1])
            else:
                if len(txt) >= minimu_len:
                    txts.append(txt)

        s = set()
        sames = 0
        for i in range(len(txts) - 1):
            txt1 = txts[i]
            txt1_len = len(txt1)
            for j in range(i + 1, len(txts)):
                txt2 = txts[j]
                txt2_len = len(txt2)
                if txt1 != txt2:
                    lcs = LCS(txt1, txt2)
                    lcs_len = len(lcs)
                    diff = abs(txt1_len - txt2_len) + min(txt1_len - lcs_len, txt2_len - lcs_len)
                    if diff < 4:
                        sames += 1
                        pair1 = '{0}\n{1}'.format(txt1, txt2)
                        pair2 = '{0}\n{1}'.format(txt2, txt1)
                        if (not (pair1 in s)) and (not (pair2 in s)):
                            s.add(pair1)
        for set_ele in s:
            set_ele_array = set_ele.split('\n')
            print('{0}, {1}'.format(set_ele_array[0], set_ele_array[1]))
        print(sames)

(7) I didn't finish solving in time ...

I thought about how to do it, but if it is n lines Search in order by 4,5,6 ... n / 2 block size, for example, when 4 and [1,4] and [12,15] match and 5 If [1,5] and [12,16] match, I feel like I can do it like deleting the former.

Impressions

-In (6), the conversion cost from text1 to text2 or vice versa is the same, so here, from the perspective of recreating the shorter one to the longer one, first look at the LCS (longest common subsequence) and the difference from the shorter one ( If min (txt1_len --lcs_len, txt2_len --lcs_len)) is first remade according to the longer one, a character string with the shorter length is created, and then the character number difference abs (txt1_len --txt2_len) with the longer one is added. Since it can be done by doing this, the content on the code is the minimum number of steps. (Maybe lol) -(7) was purely short of time ...