dailylog 4-03-20

To be converted to a daily script: pull the latest WHO COVID-19 situation report PDF, extract its tables with camelot, and append them into one CSV (see the date-parameterized URL sketch after the script).


import camelot
import requests
import pandas as pd

url = "https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200403-sitrep-74-covid-19-mp.pdf?"
# url = 'http://www.hrecos.org//images/Data/forweb/HRTVBSH.Metadata.pdf'
r = requests.get(url, stream=True)



# CLI equivalent: camelot --format csv --output ./foo.csv --pages 1-end lattice metadata_3.pdf
# Save the PDF to disk first; camelot reads from a file path, not raw response bytes.
chunk_size = 2000
with open('metadata_3.pdf', 'wb') as fd:
    for chunk in r.iter_content(chunk_size):
        fd.write(chunk)

# Parse every table on every page (read_pdf defaults to the lattice flavor)
tables = camelot.read_pdf('metadata_3.pdf', pages='1-end')
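
# Optional sanity check: each camelot Table exposes a parsing_report dict
# (accuracy, whitespace, order, page), which helps explain why some pages
# come back with odd column counts.
for table in tables:
    print(table.parsing_report)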

# Keep only tables with at least 7 columns and give every chunk the same
# integer column labels so they can be concatenated at the end.
big_df = []
for table in tables:
    df = table.df
    print('*****', df.shape)

    if df.shape[1] >= 7:
        num = df.shape[1]
        df.columns = list(range(0, num))

        if num > 7:
            # Some pages parse with an extra column; drop column 1 and
            # renumber so the chunk lines up with the 7-column ones.
            print('+++++++', num)
            print(df.head())
            smdf = df.drop(1, axis=1)
            smdf.columns = list(range(0, smdf.shape[1]))
            big_df.append(smdf)
        else:
            big_df.append(df)

frame = pd.concat(big_df, axis=0, ignore_index=True)
frame.to_csv('frametest_9.csv', index=False)
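
For the daily version, the report URL can probably be built from the date alone. A minimal sketch, assuming WHO keeps publishing one report per day and numbering them so that report 74 falls on 3 April 2020 (which matches the URL above); the helper name sitrep_url and the fixed "-mp" suffix are my own guesses and may not hold for every report:

from datetime import date

def sitrep_url(day=None):
    # Assumes one situation report per day, numbered so that report 74
    # corresponds to 2020-04-03 (as in the URL used above).
    day = day or date.today()
    sitrep_no = (day - date(2020, 1, 20)).days
    base = "https://www.who.int/docs/default-source/coronaviruse/situation-reports/"
    return f"{base}{day:%Y%m%d}-sitrep-{sitrep_no}-covid-19-mp.pdf"

# sitrep_url(date(2020, 4, 3)) reproduces the URL used at the top of this script.

With that in place, the download-and-extract steps above should run unchanged each day; only the output filename would need a date in it.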
