1. Cleaning the data
,,2012 SAT Results,,, DBN,SCHOOL NAME,# of SAT Test Takers,SAT Critical Reading Avg.,SAT Math Avg.,SAT Writing Avg. 02M047,47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECONDARY SCHOOL,16,395,400,387 06M540,A. PHILIP RANDOLPH CAMPUS HIGH SCHOOL,228,430,456,423 21K410,ABRAHAM LINCOLN HIGH SCHOOL,475,396,437,393 21K412/21K411,ABRAHAM LINCOLN YABC/LEARNING TO WORK GED AT ABRAHAM LINCOLN,s,s,s,s 30Q301,ACADEMY FOR CAREERS IN TELEVISION AND FILM,98,410,440,405 17K382,ACADEMY FOR COLLEGE PREPARATION AND CAREER EXPLORATION: A COLLEGE BOARD SCHOOL,59,396,374,369 18K637,ACADEMY FOR CONSERVATION AND THE ENVIRONMENT,35,363,381,367 32K403,ACADEMY FOR ENVIRONMENTAL LEADERSHIP,50,380,366,352 17K751,ACADEMY FOR HEALTH CAREERS,s,s,s,s 09X365,ACADEMY FOR LANGUAGE AND TECHNOLOGY,54,315,339,297 11X270,ACADEMY FOR SCHOLARSHIP AND ENTREPRENEURSHIP: A COLLEGE BOARD SCHOOL,56,386,394,361 05M367,ACADEMY FOR SOCIAL ACTION: A COLLEGE BOARD SCHOOL,33,354,366,342 14K404,ACADEMY FOR YOUNG WRITERS,68,387,357,381 30Q575,ACADEMY OF AMERICAN STUDIES,135,487,492,491 13K336,ACADEMY OF BUSINESS AND COMMUNITY DEVELOPMENT,9,439,374,418 04M635,ACADEMY OF ENVIRONMENTAL SCIENCE SECONDARY HIGH SCHOOL,48,369,369,357 24Q264,ACADEMY OF FINANCE AND ENTERPRISE,89,405,454,421 17K408,ACADEMY OF HOSPITALITY AND TOURISM,57,352,342,351 19K618,ACADEMY OF INNOVATIVE TECHNOLOGY,60,371,371,354 27Q309,ACADEMY OF MEDICAL TECHNOLOGY: A COLLEGE BOARD SCHOOL,36,367,379,348 32K552,ACADEMY OF URBAN PLANNING,67,342,364,354 13K499,ACORN COMMUNITY HIGH SCHOOL,72,384,364,368 07X600,ALFRED E. SMITH CAREER AND TECHNICAL EDUCATION HIGH SCHOOL,76,394,400,364 32K554,ALL CITY LEADERSHIP SECONDARY SCHOOL,39,428,465,422 08X376,"ANTONIA PANTOJA PREPARATORY ACADEMY, A COLLEGE BOARD SCHOOL",s,s,s,s 08X367,"ARCHIMEDES ACADEMY FOR MATH, SCIENCE AND TECHNOLOGY APPLICATIONS",s,s,s,s 02M630,ART AND DESIGN HIGH SCHOOL,270,444,441,430 ... 09X505,"BRONX SCHOOL FOR LAW, GOVERNMENT AND JUSTICE",80,404,418,402
def CleanSATData(infile, outfile): '''Read and clean the SAT data, then write it out''' try: fin = open(infile,'r') raw_string = fin.read() raw_lines = raw_string.split('\n') fin.close() except: print 'ERROR: cannot read '+infile return clean_lines = [] # the cleaned data lines for raw_line in raw_lines: raw_line = raw_line.strip() # in case there are trailing '\r' chars fields = raw_line.split(',') # Reject if there aren't enough fields if len(fields) < 6: continue # Reject if any one of the last 4 fields is not integers allFieldsNumeric = True for i in [-4,-3,-2,-1]: if not fields[i].isdigit(): allFieldsNumeric = False if not allFieldsNumeric: continue # now deal with the school names inside double-quotes: if '"' in raw_line: pos_1 = raw_line.find('"') # find the positions of the quotes pos_2 = raw_line.find('"',pos_1+1) name = raw_line[pos_1+1:pos_2] # extract the school name name = name.replace(',',';') # change the school name raw_line = raw_line[:pos_1] + name + raw_line[pos_2+1:] # replace name, and removing quotes fields = raw_line.split(',') # re-split the line into fields # create the clean line clean_line = ','.join(fields) clean_lines.append(clean_line) # write it out try: fout = open(outfile,'w') fout.write('\n'.join(clean_lines)) fout.close() except: print 'ERROR: cannot write to '+outfile
def Create_dSchools(cleanFile): try: fin = open(cleanFile,'r') lines = fin.read().split('\n') fin.close() except: print 'ERROR: cannot read '+cleanFile return dSchools={} for line in lines: fields = line.split(',') # calculate total SAT score totalSAT = 0 for i in [-3,-2,-1]: totalSAT += int(fields[i]) dSchools[fields[1]] = totalSAT return dSchools
def DictInvert(indict): ''' Invert the given dictionary and return it''' outdict={} for key in indict: value = indict[key] if value in outdict: outdict[value].append(key) else: outdict[value] = [key] return outdict
def PrintSATSchools(cleanFile): dSchools = Create_dSchools(cleanFile) dSAT = DictInvert(dSchools) keys = dSAT.keys() keys.sort() print 'Highest SAT: ',keys[-1],dSAT[keys[-1]] median = len(keys)/2 print 'Median SAT: ',keys[median],dSAT[keys[median]] print 'Lowest SAT: ',keys[0],dSAT[keys[0]]