hdx.utilities.downloader
Downloading utilities for urls.
Download Objects
class Download(BaseDownload)
Download class with various download operations. Requires either global user agent to be set or appropriate user agent parameter(s) to be completed.
Arguments:
user_agent (Optional[str]) - User agent string. HDXPythonUtilities/X.X.X- is prefixed.
user_agent_config_yaml (Optional[str]) - Path to YAML user agent configuration. Ignored if user_agent supplied. Defaults to ~/.useragent.yaml.
user_agent_lookup (Optional[str]) - Lookup key for YAML. Ignored if user_agent supplied.
use_env (bool) - Whether to read environment variables. Defaults to True.
fail_on_missing_file (bool) - Raise an exception if any specified configuration files are missing. Defaults to True.
verify (bool) - Whether to verify SSL certificates. Defaults to True.
rate_limit (Optional[Dict]) - Rate limiting per host e.g. {"calls": 1, "period": 0.1}. Defaults to None.
**kwargs - See below
auth (Tuple[str, str]) - Authorisation information in tuple form (user, pass) OR
basic_auth (str) - Authorisation information in basic auth string form (Basic xxxxxxxxxxxxxxxx) OR
basic_auth_file (str) - Path to file containing authorisation information in basic auth string form (Basic xxxxxxxxxxxxxxxx)
bearer_token (str) - Bearer token string OR
bearer_token_file (str) - Path to file containing bearer token string OR
extra_params_dict (Dict[str, str]) - Extra parameters to put on end of url as a dictionary OR
extra_params_json (str) - Path to JSON file containing extra parameters to put on end of url OR
extra_params_yaml (str) - Path to YAML file containing extra parameters to put on end of url
extra_params_lookup (str) - Lookup key for parameters. If not given assumes parameters are at root of the dict.
headers (Dict) - Additional headers to add to request.
use_auth (str) - If more than one auth found, specify which one to use, rather than failing.
status_forcelist (ListTuple[int]) - HTTP statuses for which to force retry
allowed_methods (iterable) - HTTP methods for which to force retry. Defaults to frozenset(['GET']).
close_response
def close_response() -> None
Close response.
Returns:
None
close
def close() -> None
Close response and session.
Returns:
None
__exit__
def __exit__(exc_type: Any, exc_value: Any, traceback: Any) -> None
Allow usage of with.
Arguments:
exc_type (Any) - Exception type
exc_value (Any) - Exception value
traceback (Any) - Traceback
Returns:
None
get_path_for_url
@staticmethod
def get_path_for_url(url: str,
folder: Optional[str] = None,
filename: Optional[str] = None,
path: Optional[str] = None,
overwrite: bool = False,
keep: bool = False) -> str
Get filename from url and join to provided folder or temporary folder if no folder supplied, ensuring uniqueness.
Arguments:
url (str) - URL to download
folder (Optional[str]) - Folder to download it to. Defaults to None (temporary folder).
filename (Optional[str]) - Filename to use for downloaded file. Defaults to None (derive from the url).
path (Optional[str]) - Full path to use for downloaded file. Defaults to None (use folder and filename).
overwrite (bool) - Whether to overwrite existing file. Defaults to False.
keep (bool) - Whether to keep already downloaded file. Defaults to False.
Returns:
str
- Path of downloaded file
get_full_url
def get_full_url(url: str) -> str
Get full url including any additional parameters.
Arguments:
url
str - URL for which to get full url
Returns:
str
- Full url including any additional parameters
get_url_for_get
@staticmethod
def get_url_for_get(url: str, parameters: Optional[Dict] = None) -> str
Get full url for GET request including parameters.
Arguments:
url (str) - URL to download
parameters (Optional[Dict]) - Parameters to pass. Defaults to None.
Returns:
str
- Full url
get_url_params_for_post
@staticmethod
def get_url_params_for_post(url: str,
parameters: Optional[Dict] = None
) -> Tuple[str, Dict]
Get full url for POST request and all parameters including any in the url.
Arguments:
url (str) - URL to download
parameters (Optional[Dict]) - Parameters to pass. Defaults to None.
Returns:
Tuple[str, Dict]: (Full url, parameters)
hxl_row
@staticmethod
def hxl_row(headers: ListTuple[str],
hxltags: Dict[str, str],
dict_form: bool = False) -> Union[List[str], Dict[str, str]]
Return HXL tag row for header row given list of headers and dictionary with header to HXL hashtag mappings. Return list or dictionary depending upon the dict_form argument.
Arguments:
headers (ListTuple[str]) - Headers for which to get HXL hashtags
hxltags (Dict[str,str]) - Header to HXL hashtag mapping
dict_form (bool) - Return dict or list. Defaults to False (list)
Returns:
Union[List[str],Dict[str,str]]
- Return either a list or dictionary containing HXL hashtags
normal_setup
def normal_setup(url: str,
stream: bool = True,
post: bool = False,
parameters: Optional[Dict] = None,
timeout: Optional[float] = None,
headers: Optional[Dict] = None,
encoding: Optional[str] = None) -> requests.Response
Setup download from provided url returning the response.
Arguments:
url (str) - URL or path to download
stream (bool) - Whether to stream download. Defaults to True.
post (bool) - Whether to use POST instead of GET. Defaults to False.
parameters (Optional[Dict]) - Parameters to pass. Defaults to None.
timeout (Optional[float]) - Timeout for connecting to URL. Defaults to None (no timeout).
headers (Optional[Dict]) - Headers to pass. Defaults to None.
encoding (Optional[str]) - Encoding to use for text response. Defaults to None (best guess).
Returns:
requests.Response
- requests.Response object
set_bearer_token
def set_bearer_token(bearer_token: str) -> None
Set bearer token
Arguments:
bearer_token
str - Bearer token
Returns:
None
hash_stream
def hash_stream(url: str) -> str
Stream file from url and hash it using MD5. Must call setup method first.
Arguments:
url
str - URL or path to download
Returns:
str
- MD5 hash of file
stream_path
def stream_path(path: str, errormsg: str)
Stream file from url and store in provided path. Must call setup method first.
Arguments:
path (str) - Path for downloaded file
errormsg (str) - Error message to display if there is a problem
Returns:
str
- Path of downloaded file
stream_file
def stream_file(url: str,
folder: Optional[str] = None,
filename: Optional[str] = None,
path: Optional[str] = None,
overwrite: bool = False,
keep: bool = False) -> str
Stream file from url and store in provided folder or temporary folder if no folder supplied. Must call setup method first.
Arguments:
url (str) - URL or path to download
folder (Optional[str]) - Folder to download it to. Defaults to None (temporary folder).
filename (Optional[str]) - Filename to use for downloaded file. Defaults to None (derive from the url).
path (Optional[str]) - Full path to use for downloaded file. Defaults to None (use folder and filename).
overwrite (bool) - Whether to overwrite existing file. Defaults to False.
keep (bool) - Whether to keep already downloaded file. Defaults to False.
Returns:
str
- Path of downloaded file
download_file
def download_file(url: str, **kwargs: Any) -> str
Download file from url and store in provided folder or temporary folder if no folder supplied.
Arguments:
url (str) - URL or path to download
**kwargs - See below
folder (str) - Folder to download it to. Defaults to temporary folder.
filename (str) - Filename to use for downloaded file. Defaults to deriving from url.
path (str) - Full path to use for downloaded file instead of folder and filename.
overwrite (bool) - Whether to overwrite existing file. Defaults to False.
keep (bool) - Whether to keep already downloaded file. Defaults to False.
post (bool) - Whether to use POST instead of GET. Defaults to False.
parameters (Dict) - Parameters to pass. Defaults to None.
timeout (float) - Timeout for connecting to URL. Defaults to None (no timeout).
headers (Dict) - Headers to pass. Defaults to None.
encoding (str) - Encoding to use for text response. Defaults to None (best guess).
Returns:
str
- Path of downloaded file
download
def download(url: str, **kwargs: Any) -> requests.Response
Download url.
Arguments:
url (str) - URL or path to download
**kwargs - See below
post (bool) - Whether to use POST instead of GET. Defaults to False.
parameters (Dict) - Parameters to pass. Defaults to None.
timeout (float) - Timeout for connecting to URL. Defaults to None (no timeout).
headers (Dict) - Headers to pass. Defaults to None.
encoding (str) - Encoding to use for text response. Defaults to None (best guess).
Returns:
requests.Response
- Response
get_header
def get_header(header: str) -> Any
Get a particular response header of download.
Arguments:
header
str - Header for which to get value
Returns:
Any
- Response header's value
get_headers
def get_headers() -> Any
Get response headers of download.
Returns:
Any
- Response headers
get_status
def get_status() -> int
Get response status code.
Returns:
int
- Response status code
get_text
def get_text() -> str
Get text content of download.
Returns:
str
- Text content of download
get_yaml
def get_yaml() -> Any
Get YAML content of download.
Returns:
Any
- YAML content of download
get_json
def get_json() -> Any
Get JSON content of download.
Returns:
Any
- JSON content of download
download_text
def download_text(url: str, **kwargs: Any) -> str
Download url as text.
Arguments:
url (str) - URL or path to download
**kwargs - See below
post (bool) - Whether to use POST instead of GET. Defaults to False.
parameters (Dict) - Parameters to pass. Defaults to None.
timeout (float) - Timeout for connecting to URL. Defaults to None (no timeout).
headers (Dict) - Headers to pass. Defaults to None.
encoding (str) - Encoding to use for text response. Defaults to None (best guess).
Returns:
str
- Text content of download
download_yaml
def download_yaml(url: str, **kwargs: Any) -> Any
Download url as YAML.
Arguments:
url (str) - URL or path to download
**kwargs - See below
post (bool) - Whether to use POST instead of GET. Defaults to False.
parameters (Dict) - Parameters to pass. Defaults to None.
timeout (float) - Timeout for connecting to URL. Defaults to None (no timeout).
headers (Dict) - Headers to pass. Defaults to None.
encoding (str) - Encoding to use for text response. Defaults to None (best guess).
Returns:
Any
- YAML content of download
download_json
def download_json(url: str, **kwargs: Any) -> Any
Download url as JSON.
Arguments:
url (str) - URL or path to download
**kwargs - See below
post (bool) - Whether to use POST instead of GET. Defaults to False.
parameters (Dict) - Parameters to pass. Defaults to None.
timeout (float) - Timeout for connecting to URL. Defaults to None (no timeout).
headers (Dict) - Headers to pass. Defaults to None.
encoding (str) - Encoding to use for text response. Defaults to None (best guess).
Returns:
Any
- JSON content of download
get_frictionless_tableresource
def get_frictionless_tableresource(url: str,
ignore_blank_rows: bool = True,
infer_types: bool = False,
**kwargs: Any) -> TableResource
Get Frictionless TableResource.
Arguments:
url (str) - URL or path to download
ignore_blank_rows (bool) - Whether to ignore blank rows. Defaults to True.
infer_types (bool) - Whether to infer types. Defaults to False (strings).
**kwargs:
has_header (bool) - Whether data has a header. Defaults to True.
headers (Union[int, ListTuple[int], ListTuple[str]]) - Number of row(s) containing headers or list of headers
columns (Union[ListTuple[int], ListTuple[str], None]) - Columns to pick. Defaults to all.
format (Optional[str]) - Type of file. Defaults to inferring.
file_type (Optional[str]) - Type of file. Defaults to inferring.
encoding (Optional[str]) - Type of encoding. Defaults to inferring.
compression (Optional[str]) - Type of compression. Defaults to inferring.
delimiter (Optional[str]) - Delimiter for values in csv rows. Defaults to inferring.
skip_initial_space (bool) - Ignore whitespace straight after delimiter. Defaults to False.
sheet (Optional[Union[int, str]]) - Sheet in Excel. Defaults to inferring.
fill_merged_cells (bool) - Whether to fill merged cells. Defaults to True.
http_session (Session) - Session object to use. Defaults to downloader session.
columns (Union[ListTuple[int], ListTuple[str], None]) - Columns to pick. Defaults to all.
default_type (Optional[str]) - Default field type if infer_types False. Defaults to string.
float_numbers (bool) - Use float not Decimal if infer_types True. Defaults to True.
null_values (List[Any]) - Values that will return None. Defaults to [""].
dialect (Dialect) - This can be set to override the above. See Frictionless docs.
detector (Detector) - This can be set to override the above. See Frictionless docs.
layout (Layout) - This can be set to override the above. See Frictionless docs.
schema (Schema) - This can be set to override the above. See Frictionless docs.
Returns:
TableResource
- frictionless TableResource object
get_tabular_rows
def get_tabular_rows(url: Union[str, ListTuple[str]],
has_hxl: bool = False,
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
dict_form: bool = False,
include_headers: bool = False,
ignore_blank_rows: bool = True,
infer_types: bool = False,
header_insertions: Optional[ListTuple[Tuple[int,
str]]] = None,
row_function: Optional[Callable[[List[str], ListDict],
ListDict]] = None,
**kwargs: Any) -> Tuple[List[str], Iterator[ListDict]]
Returns header of tabular file(s) pointed to by url and an iterator where each row is returned as a list or dictionary depending on the dict_form argument.
When a list of urls is supplied (in url), then the has_hxl flag indicates if the files are HXLated so that the HXL row is only included from the first file. The headers argument is either a row number or list of row numbers (in case of multi-line headers) to be considered as headers (rows start counting at 1), or the actual headers defined as a list of strings. It defaults to 1. The dict_form argument specifies if each row should be returned as a dictionary or a list, defaulting to a list.
Optionally, headers can be inserted at specific positions. This is achieved using the header_insertions argument. If supplied, it is a list of tuples of the form (position, header) to be inserted. A function is called for each row. If supplied, it takes as arguments: headers (prior to any insertions) and row (which will be in dict or list form depending upon the dict_form argument) and outputs a modified row or None to ignore the row.
Arguments:
url (Union[str, ListTuple[str]]) - A single or list of URLs or paths to read from
has_hxl (bool) - Whether files have HXL hashtags. Ignored for single url. Defaults to False.
headers (Union[int, ListTuple[int], ListTuple[str]]) - Number of row(s) containing headers or list of headers. Defaults to 1.
dict_form (bool) - Return dict or list for each row. Defaults to False (list)
include_headers (bool) - Whether to include headers in iterator. Defaults to False.
ignore_blank_rows (bool) - Whether to ignore blank rows. Defaults to True.
infer_types (bool) - Whether to infer types. Defaults to False (strings).
header_insertions (Optional[ListTuple[Tuple[int,str]]]) - List of (position, header) to insert. Defaults to None.
row_function (Optional[Callable[[List[str],ListDict],ListDict]]) - Function to call for each row. Defaults to None.
**kwargs:
format (Optional[str]) - Type of file. Defaults to inferring.
file_type (Optional[str]) - Type of file. Defaults to inferring.
encoding (Optional[str]) - Type of encoding. Defaults to inferring.
compression (Optional[str]) - Type of compression. Defaults to inferring.
delimiter (Optional[str]) - Delimiter for values in csv rows. Defaults to inferring.
skip_initial_space (bool) - Ignore whitespace straight after delimiter. Defaults to False.
sheet (Optional[Union[int, str]]) - Sheet in Excel. Defaults to inferring.
fill_merged_cells (bool) - Whether to fill merged cells. Defaults to True.
http_session (Session) - Session object to use. Defaults to downloader session.
columns (Union[ListTuple[int], ListTuple[str], None]) - Columns to pick. Defaults to all.
default_type (Optional[str]) - Default field type if infer_types False. Defaults to string.
float_numbers (bool) - Use float not Decimal if infer_types True. Defaults to True.
null_values (List[Any]) - Values that will return None. Defaults to [""].
dialect (Dialect) - This can be set to override the above. See Frictionless docs.
detector (Detector) - This can be set to override the above. See Frictionless docs.
layout (Layout) - This can be set to override the above. See Frictionless docs.
schema (Schema) - This can be set to override the above. See Frictionless docs.
Returns:
Tuple[List[str],Iterator[ListDict]]
- Tuple (headers, iterator where each row is a list or dictionary)
get_tabular_rows_as_list
def get_tabular_rows_as_list(
url: Union[str, ListTuple[str]],
has_hxl: bool = False,
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
include_headers: bool = True,
ignore_blank_rows: bool = True,
infer_types: bool = False,
header_insertions: Optional[ListTuple[Tuple[int, str]]] = None,
row_function: Optional[Callable[[List[str], ListDict],
ListDict]] = None,
**kwargs: Any) -> Tuple[List[str], Iterator[List]]
Returns headers and an iterator where each row is returned as a list.
When a list of urls is supplied (in url), then the has_hxl flag indicates if the files are HXLated so that the HXL row is only included from the first file. The headers argument is either a row number or list of row numbers (in case of multi-line headers) to be considered as headers (rows start counting at 1), or the actual headers defined as a list of strings. It defaults to 1 and cannot be None.
Optionally, headers can be inserted at specific positions. This is achieved using the header_insertions argument. If supplied, it is a list of tuples of the form (position, header) to be inserted. A function is called for each row. If supplied, it takes as arguments: headers (prior to any insertions) and row (which will be in dict or list form depending upon the dict_rows argument) and outputs a modified row or None to ignore the row.
Arguments:
url (Union[str, ListTuple[str]]) - A single or list of URLs or paths to read from
has_hxl (bool) - Whether files have HXL hashtags. Ignored for single url. Defaults to False.
headers (Union[int, ListTuple[int], ListTuple[str]]) - Number of row(s) containing headers or list of headers. Defaults to 1.
include_headers (bool) - Whether to include headers in iterator. Defaults to True.
ignore_blank_rows (bool) - Whether to ignore blank rows. Defaults to True.
infer_types (bool) - Whether to infer types. Defaults to False (strings).
header_insertions (Optional[ListTuple[Tuple[int,str]]]) - List of (position, header) to insert. Defaults to None.
row_function (Optional[Callable[[List[str],ListDict],ListDict]]) - Function to call for each row. Defaults to None.
**kwargs:
format (Optional[str]) - Type of file. Defaults to inferring.
file_type (Optional[str]) - Type of file. Defaults to inferring.
encoding (Optional[str]) - Type of encoding. Defaults to inferring.
compression (Optional[str]) - Type of compression. Defaults to inferring.
delimiter (Optional[str]) - Delimiter for values in csv rows. Defaults to inferring.
skip_initial_space (bool) - Ignore whitespace straight after delimiter. Defaults to False.
sheet (Optional[Union[int, str]]) - Sheet in Excel. Defaults to inferring.
fill_merged_cells (bool) - Whether to fill merged cells. Defaults to True.
http_session (Session) - Session object to use. Defaults to downloader session.
columns (Union[ListTuple[int], ListTuple[str], None]) - Columns to pick. Defaults to all.
default_type (Optional[str]) - Default field type if infer_types False. Defaults to string.
float_numbers (bool) - Use float not Decimal if infer_types True. Defaults to True.
null_values (List[Any]) - Values that will return None. Defaults to [""].
dialect (Dialect) - This can be set to override the above. See Frictionless docs.
detector (Detector) - This can be set to override the above. See Frictionless docs.
layout (Layout) - This can be set to override the above. See Frictionless docs.
schema (Schema) - This can be set to override the above. See Frictionless docs.
Returns:
Tuple[List[str],Iterator[List]]
- Tuple (headers, iterator where each row is a list)
get_tabular_rows_as_dict
def get_tabular_rows_as_dict(
url: Union[str, ListTuple[str]],
has_hxl: bool = False,
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
ignore_blank_rows: bool = True,
infer_types: bool = False,
header_insertions: Optional[ListTuple[Tuple[int, str]]] = None,
row_function: Optional[Callable[[List[str], ListDict],
ListDict]] = None,
**kwargs: Any) -> Tuple[List[str], Iterator[Dict]]
Returns headers and an iterator where each row is returned as a dictionary.
When a list of urls is supplied (in url), then the has_hxl flag indicates if the files are HXLated so that the HXL row is only included from the first file. The headers argument is either a row number or list of row numbers (in case of multi-line headers) to be considered as headers (rows start counting at 1), or the actual headers defined as a list of strings. It defaults to 1 and cannot be None.
Optionally, headers can be inserted at specific positions. This is achieved using the header_insertions argument. If supplied, it is a list of tuples of the form (position, header) to be inserted. A function is called for each row. If supplied, it takes as arguments: headers (prior to any insertions) and row (which will be in dict or list form depending upon the dict_rows argument) and outputs a modified row or None to ignore the row.
Arguments:
url (Union[str, ListTuple[str]]) - A single or list of URLs or paths to read from
has_hxl (bool) - Whether files have HXL hashtags. Ignored for single url. Defaults to False.
headers (Union[int, ListTuple[int], ListTuple[str]]) - Number of row(s) containing headers or list of headers. Defaults to 1.
ignore_blank_rows (bool) - Whether to ignore blank rows. Defaults to True.
infer_types (bool) - Whether to infer types. Defaults to False (strings).
header_insertions (Optional[ListTuple[Tuple[int,str]]]) - List of (position, header) to insert. Defaults to None.
row_function (Optional[Callable[[List[str],ListDict],ListDict]]) - Function to call for each row. Defaults to None.
**kwargs:
format (Optional[str]) - Type of file. Defaults to inferring.
file_type (Optional[str]) - Type of file. Defaults to inferring.
encoding (Optional[str]) - Type of encoding. Defaults to inferring.
compression (Optional[str]) - Type of compression. Defaults to inferring.
delimiter (Optional[str]) - Delimiter for values in csv rows. Defaults to inferring.
skip_initial_space (bool) - Ignore whitespace straight after delimiter. Defaults to False.
sheet (Optional[Union[int, str]]) - Sheet in Excel. Defaults to inferring.
fill_merged_cells (bool) - Whether to fill merged cells. Defaults to True.
http_session (Session) - Session object to use. Defaults to downloader session.
columns (Union[ListTuple[int], ListTuple[str], None]) - Columns to pick. Defaults to all.
default_type (Optional[str]) - Default field type if infer_types False. Defaults to string.
float_numbers (bool) - Use float not Decimal if infer_types True. Defaults to True.
null_values (List[Any]) - Values that will return None. Defaults to [""].
dialect (Dialect) - This can be set to override the above. See Frictionless docs.
detector (Detector) - This can be set to override the above. See Frictionless docs.
layout (Layout) - This can be set to override the above. See Frictionless docs.
schema (Schema) - This can be set to override the above. See Frictionless docs.
Returns:
Tuple[List[str], Iterator[Dict]]: Tuple (headers, iterator where each row is a dictionary)
download_tabular_key_value
def download_tabular_key_value(
url: Union[str, ListTuple[str]],
has_hxl: bool = False,
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
include_headers: bool = True,
ignore_blank_rows: bool = True,
infer_types: bool = False,
header_insertions: Optional[ListTuple[Tuple[int, str]]] = None,
row_function: Optional[Callable[[List[str], ListDict],
ListDict]] = None,
**kwargs: Any) -> Dict
Download 2 column csv from url and return a dictionary of keys (first column) and values (second column).
When a list of urls is supplied (in url), then the has_hxl flag indicates if the files are HXLated so that the HXL row is only included from the first file. The headers argument is either a row number or list of row numbers (in case of multi-line headers) to be considered as headers (rows start counting at 1), or the actual headers defined as a list of strings. It defaults to 1 and cannot be None.
Optionally, headers can be inserted at specific positions. This is achieved using the header_insertions argument. If supplied, it is a list of tuples of the form (position, header) to be inserted. A function is called for each row. If supplied, it takes as arguments: headers (prior to any insertions) and row (which will be in dict or list form depending upon the dict_rows argument) and outputs a modified row or None to ignore the row.
Arguments:
url (Union[str, ListTuple[str]]) - A single or list of URLs or paths to read from
has_hxl (bool) - Whether files have HXL hashtags. Ignored for single url. Defaults to False.
headers (Union[int, ListTuple[int], ListTuple[str]]) - Number of row(s) containing headers or list of headers. Defaults to 1.
include_headers (bool) - Whether to include headers in iterator. Defaults to True.
ignore_blank_rows (bool) - Whether to ignore blank rows. Defaults to True.
infer_types (bool) - Whether to infer types. Defaults to False (strings).
header_insertions (Optional[ListTuple[Tuple[int,str]]]) - List of (position, header) to insert. Defaults to None.
row_function (Optional[Callable[[List[str],ListDict],ListDict]]) - Function to call for each row. Defaults to None.
**kwargs:
format (Optional[str]) - Type of file. Defaults to inferring.
file_type (Optional[str]) - Type of file. Defaults to inferring.
encoding (Optional[str]) - Type of encoding. Defaults to inferring.
compression (Optional[str]) - Type of compression. Defaults to inferring.
delimiter (Optional[str]) - Delimiter for values in csv rows. Defaults to inferring.
skip_initial_space (bool) - Ignore whitespace straight after delimiter. Defaults to False.
sheet (Optional[Union[int, str]]) - Sheet in Excel. Defaults to inferring.
fill_merged_cells (bool) - Whether to fill merged cells. Defaults to True.
http_session (Session) - Session object to use. Defaults to downloader session.
columns (Union[ListTuple[int], ListTuple[str], None]) - Columns to pick. Defaults to all.
default_type (Optional[str]) - Default field type if infer_types False. Defaults to string.
float_numbers (bool) - Use float not Decimal if infer_types True. Defaults to True.
null_values (List[Any]) - Values that will return None. Defaults to [""].
dialect (Dialect) - This can be set to override the above. See Frictionless docs.
detector (Detector) - This can be set to override the above. See Frictionless docs.
layout (Layout) - This can be set to override the above. See Frictionless docs.
schema (Schema) - This can be set to override the above. See Frictionless docs.
Returns:
Dict
- Dictionary keys (first column) and values (second column)
download_tabular_rows_as_dicts
def download_tabular_rows_as_dicts(
url: Union[str, ListTuple[str]],
has_hxl: bool = False,
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
keycolumn: int = 1,
ignore_blank_rows: bool = True,
infer_types: bool = False,
header_insertions: Optional[ListTuple[Tuple[int, str]]] = None,
row_function: Optional[Callable[[List[str], ListDict],
ListDict]] = None,
**kwargs: Any) -> Dict[str, Dict]
Download multicolumn csv from url and return dictionary where keys are first column and values are dictionaries with keys from column headers and values from columns beneath.
When a list of urls is supplied (in url), then the has_hxl flag indicates if the files are HXLated so that the HXL row is only included from the first file. The headers argument is either a row number or list of row numbers (in case of multi-line headers) to be considered as headers (rows start counting at 1), or the actual headers defined as a list of strings. It defaults to 1 and cannot be None.
Optionally, headers can be inserted at specific positions. This is achieved using the header_insertions argument. If supplied, it is a list of tuples of the form (position, header) to be inserted. A function is called for each row. If supplied, it takes as arguments: headers (prior to any insertions) and row (which will be in dict or list form depending upon the dict_rows argument) and outputs a modified row or None to ignore the row.
Arguments:
url (Union[str, ListTuple[str]]) - A single or list of URLs or paths to read from
has_hxl (bool) - Whether files have HXL hashtags. Ignored for single url. Defaults to False.
headers (Union[int, ListTuple[int], ListTuple[str]]) - Number of row(s) containing headers or list of headers. Defaults to 1.
keycolumn (int) - Number of column to be used for key. Defaults to 1.
ignore_blank_rows (bool) - Whether to ignore blank rows. Defaults to True.
infer_types (bool) - Whether to infer types. Defaults to False (strings).
header_insertions (Optional[ListTuple[Tuple[int,str]]]) - List of (position, header) to insert. Defaults to None.
row_function (Optional[Callable[[List[str],ListDict],ListDict]]) - Function to call for each row. Defaults to None.
**kwargs:
format (Optional[str]) - Type of file. Defaults to inferring.
file_type (Optional[str]) - Type of file. Defaults to inferring.
encoding (Optional[str]) - Type of encoding. Defaults to inferring.
compression (Optional[str]) - Type of compression. Defaults to inferring.
delimiter (Optional[str]) - Delimiter for values in csv rows. Defaults to inferring.
skip_initial_space (bool) - Ignore whitespace straight after delimiter. Defaults to False.
sheet (Optional[Union[int, str]]) - Sheet in Excel. Defaults to inferring.
fill_merged_cells (bool) - Whether to fill merged cells. Defaults to True.
http_session (Session) - Session object to use. Defaults to downloader session.
columns (Union[ListTuple[int], ListTuple[str], None]) - Columns to pick. Defaults to all.
default_type (Optional[str]) - Default field type if infer_types False. Defaults to string.
float_numbers (bool) - Use float not Decimal if infer_types True. Defaults to True.
null_values (List[Any]) - Values that will return None. Defaults to [""].
dialect (Dialect) - This can be set to override the above. See Frictionless docs.
detector (Detector) - This can be set to override the above. See Frictionless docs.
layout (Layout) - This can be set to override the above. See Frictionless docs.
schema (Schema) - This can be set to override the above. See Frictionless docs.
Returns:
Dict[str,Dict]
- Dictionary where keys are first column and values are dictionaries with keys from column headers and values from columns beneath
download_tabular_cols_as_dicts
def download_tabular_cols_as_dicts(
url: Union[str, ListTuple[str]],
has_hxl: bool = False,
headers: Union[int, ListTuple[int], ListTuple[str]] = 1,
keycolumn: int = 1,
ignore_blank_rows: bool = True,
infer_types: bool = False,
header_insertions: Optional[ListTuple[Tuple[int, str]]] = None,
row_function: Optional[Callable[[List[str], ListDict],
ListDict]] = None,
**kwargs: Any) -> Dict[str, Dict]
Download multicolumn csv from url and return dictionary where keys are header names and values are dictionaries with keys from first column and values from other columns.
When a list of urls is supplied (in url), then the has_hxl flag indicates if the files are HXLated so that the HXL row is only included from the first file. The headers argument is either a row number or list of row numbers (in case of multi-line headers) to be considered as headers (rows start counting at 1), or the actual headers defined as a list of strings. It defaults to 1 and cannot be None.
Optionally, headers can be inserted at specific positions. This is achieved using the header_insertions argument. If supplied, it is a list of tuples of the form (position, header) to be inserted. A function is called for each row. If supplied, it takes as arguments: headers (prior to any insertions) and row (which will be in dict or list form depending upon the dict_rows argument) and outputs a modified row or None to ignore the row.
Arguments:
url (Union[str, ListTuple[str]]) - A single or list of URLs or paths to read from
has_hxl (bool) - Whether files have HXL hashtags. Ignored for single url. Defaults to False.
headers (Union[int, ListTuple[int], ListTuple[str]]) - Number of row(s) containing headers or list of headers. Defaults to 1.
keycolumn (int) - Number of column to be used for key. Defaults to 1.
ignore_blank_rows (bool) - Whether to ignore blank rows. Defaults to True.
infer_types (bool) - Whether to infer types. Defaults to False (strings).
header_insertions (Optional[ListTuple[Tuple[int,str]]]) - List of (position, header) to insert. Defaults to None.
row_function (Optional[Callable[[List[str],ListDict],ListDict]]) - Function to call for each row. Defaults to None.
**kwargs:
format (Optional[str]) - Type of file. Defaults to inferring.
file_type (Optional[str]) - Type of file. Defaults to inferring.
encoding (Optional[str]) - Type of encoding. Defaults to inferring.
compression (Optional[str]) - Type of compression. Defaults to inferring.
delimiter (Optional[str]) - Delimiter for values in csv rows. Defaults to inferring.
skip_initial_space (bool) - Ignore whitespace straight after delimiter. Defaults to False.
sheet (Optional[Union[int, str]]) - Sheet in Excel. Defaults to inferring.
fill_merged_cells (bool) - Whether to fill merged cells. Defaults to True.
http_session (Session) - Session object to use. Defaults to downloader session.
columns (Union[ListTuple[int], ListTuple[str], None]) - Columns to pick. Defaults to all.
default_type (Optional[str]) - Default field type if infer_types False. Defaults to string.
float_numbers (bool) - Use float not Decimal if infer_types True. Defaults to True.
null_values (List[Any]) - Values that will return None. Defaults to [""].
dialect (Dialect) - This can be set to override the above. See Frictionless docs.
detector (Detector) - This can be set to override the above. See Frictionless docs.
layout (Layout) - This can be set to override the above. See Frictionless docs.
schema (Schema) - This can be set to override the above. See Frictionless docs.
Returns:
Dict[str,Dict]
- Dictionary where keys are header names and values are dictionaries with keys from first column and values from other columns
get_column_positions
@staticmethod
def get_column_positions(headers: ListTuple[str]) -> Dict[str, int]
Get mapping of headers to column positions.
Arguments:
headers
ListTuple[str] - List of headers
Returns:
Dict[str,int]
- Dictionary where keys are header names and values are header positions
generate_downloaders
@classmethod
def generate_downloaders(cls,
custom_configs: Dict[str, Dict],
user_agent: Optional[str] = None,
user_agent_config_yaml: Optional[str] = None,
user_agent_lookup: Optional[str] = None,
use_env: bool = True,
fail_on_missing_file: bool = True,
rate_limit: Optional[Dict] = None,
**kwargs: Any) -> None
Generate downloaders. Requires either global user agent to be set or appropriate user agent parameter(s) to be completed. The custom_configs dictionary is a mapping from name to a dictionary of custom configuration parameters that are added to the underlying session's params or headers. It can have keys that correspond to the input arguments of Download's constructor __init__ (or the other arguments of this method).
Arguments:
custom_configs
Dict[str, Dict] - Optional dictionary of custom configurations.
user_agent
Optional[str] - User agent string. HDXPythonUtilities/X.X.X- is prefixed.
user_agent_config_yaml
Optional[str] - Path to YAML user agent configuration. Ignored if user_agent supplied. Defaults to ~/.useragent.yaml.
user_agent_lookup
Optional[str] - Lookup key for YAML. Ignored if user_agent supplied.
use_env
bool - Whether to read environment variables. Defaults to True.
fail_on_missing_file
bool - Raise an exception if any specified configuration files are missing. Defaults to True.
rate_limit
Optional[Dict] - Rate limiting per host eg. {"calls": 1, "period": 0.1}. Defaults to None.
**kwargs
- See below
auth
Tuple[str, str] - Authorisation information in tuple form (user, pass) OR
basic_auth
str - Authorisation information in basic auth string form (Basic xxxxxxxxxxxxxxxx) OR
basic_auth_file
str - Path to file containing authorisation information in basic auth string form (Basic xxxxxxxxxxxxxxxx)
bearer_token
str - Bearer token string OR
bearer_token_file
str - Path to file containing bearer token string OR
extra_params_dict
Dict[str, str] - Extra parameters to put on end of url as a dictionary OR
extra_params_json
str - Path to JSON file containing extra parameters to put on end of url OR
extra_params_yaml
str - Path to YAML file containing extra parameters to put on end of url
extra_params_lookup
str - Lookup key for parameters. If not given assumes parameters are at root of the dict.
headers
Dict - Additional headers to add to request.
use_auth
str - If more than one auth found, specify which one to use, rather than failing.
status_forcelist
ListTuple[int] - HTTP statuses for which to force retry. Defaults to (429, 500, 502, 503, 504).
allowed_methods
ListTuple[str] - HTTP methods for which to force retry. Defaults to ("HEAD", "TRACE", "GET", "PUT", "OPTIONS", "DELETE").
Returns:
None
get_downloader
@classmethod
def get_downloader(cls, name: Optional[str] = None) -> "Download"
Get a generated downloader given a name. If name is not supplied, the default one will be returned.
Arguments:
name
Optional[str] - Name of downloader. Defaults to None (get default).
Returns:
Download
- Downloader object