# -*- coding: utf-8 -*- """ @author: Maxime Boucher """ #Import packages import zipfile import requests import pandas as pd #Download zip files url = 'https://lobbycanada.gc.ca/od-do/Communications_OCL_CAL.zip' #Write and save .zip file r = requests.get(url, allow_redirects=True) open('Communications_OCL_CAL.zip', 'wb').write(r.content) #Unzip all files in directory of your choice with zipfile.ZipFile('Communications_OCL_CAL.zip', 'r') as zip_ref: zip_ref.extractall(r'Enter your output directory') name_cols = ['COMLOG_ID',"DPOH_LAST_NM_TCPD","DPOH_FIRST_NM_PRENOM_TCPD","DPOH_TITLE_TITRE_TCPD","BRANCH_UNIT_DIRECTION_SERVICE","OTHER_INSTITUTION_AUTRE","INSTITUTION"] pull_cols = ['COMLOG_ID',"DPOH_LAST_NM_TCPD","DPOH_FIRST_NM_PRENOM_TCPD","DPOH_TITLE_TITRE_TCPD","BRANCH_UNIT_DIRECTION_SERVICE","OTHER_INSTITUTION_AUTRE","INSTITUTION"] df1 = pd.read_csv(r'Enter your directory', header=None, encoding="ISO-8859-1", names=name_cols, usecols=pull_cols, index_col='COMLOG_ID') name_cols = ['COMLOG_ID',"CLIENT_ORG_CORP_NUM","EN_CLIENT_ORG_CORP_NM_AN","FR_CLIENT_ORG_CORP_NM","REGISTRANT_NUM_DECLARANT","RGSTRNT_LAST_NM_DCLRNT","RGSTRNT_1ST_NM_PRENOM_DCLRNT","COMM_DATE","REG_TYPE_ENR","SUBMISSION_DATE_SOUMISSION","POSTED_DATE_PUBLICATION","PREV_COMLOG_ID_PRECEDNT"] pull_cols = ['COMLOG_ID',"CLIENT_ORG_CORP_NUM","EN_CLIENT_ORG_CORP_NM_AN","FR_CLIENT_ORG_CORP_NM","REGISTRANT_NUM_DECLARANT","RGSTRNT_LAST_NM_DCLRNT","RGSTRNT_1ST_NM_PRENOM_DCLRNT","COMM_DATE","REG_TYPE_ENR","SUBMISSION_DATE_SOUMISSION","POSTED_DATE_PUBLICATION","PREV_COMLOG_ID_PRECEDNT"] df2 = pd.read_csv(r'Enter your directory', header=None, encoding="ISO-8859-1", names=name_cols, usecols=pull_cols, index_col=False) #merging dataframe df3 = pd.merge(df1, df2, how='left', on=['COMLOG_ID']) df3.to_excel(r'Enter your directory', index=False) #Dropping duplicated entries df5 = pd.read_excel(r'Enter your directory') df6 = df5.drop_duplicates(keep=False) df6.to_excel(r'Enter your directory', index=False)