Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
283 changes: 283 additions & 0 deletions manual_benchmarks/test_datatypes_crunch.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "test_datatypes_crunch.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "CqyExbrNYKcv"
},
"source": [
"### Import stuff"
]
},
{
"cell_type": "code",
"metadata": {
"id": "wF4Ga_unYJ8F"
},
"source": [
"!pip install radis\n",
"\n",
"from radis.db.classes import get_molecule_identifier\n",
"from radis.levels.partfunc import PartFuncHAPI\n",
"\n",
"from radis.io.hitemp import fetch_hitemp\n",
"from radis.db.classes import get_molecule\n",
"from radis.phys.constants import hc_k\n",
"\n",
"import numpy as np\n",
"from numpy import exp, pi"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "jhglpoCFXrpp"
},
"source": [
"### Function to reduce the memory usage of pandas dataframe"
]
},
{
"cell_type": "code",
"metadata": {
"id": "CryqdLGaXdFO"
},
"source": [
"def reduce_mem_usage(props):\n",
" start_mem_usg = props.memory_usage().sum() / 1024**2 \n",
" print(\"Memory usage of properties dataframe is :\",start_mem_usg,\" MB\")\n",
" NAlist = [] # Keeps track of columns that have missing values filled in. \n",
" for col in props.columns:\n",
" if props[col].dtype != object: # Exclude strings\n",
" \n",
" # Print current column type\n",
" print(\"******************************\")\n",
" print(\"Column: \",col)\n",
" print(\"dtype before: \",props[col].dtype)\n",
" \n",
" # make variables for Int, max and min\n",
" IsInt = False\n",
" mx = props[col].max()\n",
" mn = props[col].min()\n",
" \n",
" # Integer does not support NA, therefore, NA needs to be filled\n",
" if not np.isfinite(props[col]).all(): \n",
" NAlist.append(col)\n",
" props[col].fillna(-1,inplace=True) \n",
" \n",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Erwan that we should do this conversion at an earlier step, namely when loading the database. Once that is implemented, you can assume at this point that all data is finite.

" # test if column can be converted to an integer\n",
" asint = props[col].fillna(0).astype(np.int64)\n",
" result = (props[col] - asint)\n",
" result = result.sum()\n",
" if result > -0.01 and result < 0.01:\n",
" IsInt = True\n",
"\n",
" \n",
" # Make Integer/unsigned Integer datatypes\n",
" if IsInt:\n",
" if mn >= 0:\n",
" if mx < 255:\n",
" props[col] = props[col].astype(np.uint8)\n",
" elif mx < 65535:\n",
" props[col] = props[col].astype(np.uint16)\n",
" elif mx < 4294967295:\n",
" props[col] = props[col].astype(np.uint32)\n",
" else:\n",
" props[col] = props[col].astype(np.uint64)\n",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For readability, I would write the limits as (1 << 8) - 1, (1 << 16) - 1, (1 << 32) - 1, etc.
I believe that the "- 1" part is not even necessary: the values are integers and you're using "less than" (without the "or equal to"), so you could use (1 << 8), (1 << 16), and (1 << 32) as limits.

a << b means "bit-shift integer a by b positions to the left", so 1 << b is shorthand for 2**b.

" else:\n",
" if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:\n",
" props[col] = props[col].astype(np.int8)\n",
" elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:\n",
" props[col] = props[col].astype(np.int16)\n",
" elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:\n",
" props[col] = props[col].astype(np.int32)\n",
" elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:\n",
" props[col] = props[col].astype(np.int64) \n",
" \n",
" # Make float datatypes 32 bit\n",
" else:\n",
" props[col] = props[col].astype(np.float32)\n",
" \n",
" # Print new column type\n",
" print(\"dtype after: \",props[col].dtype)\n",
" print(\"******************************\")\n",
" \n",
Copy link
Member

@dcmvdbekerom dcmvdbekerom Jul 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As Hajime Kawahara recently pointed out, conversion to float32 may be too restrictive for the line position (v0): https://radis-radiation.slack.com/archives/C01G3117DJS/p1625716717008500?thread_ts=1625633254.001800&cid=C01G3117DJS For all other non-integer columns, float32 is probably fine.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, np.finfo is a cool function: it shows the relative accuracy of a float dtype, so it could be used to check whether our dtype reductions are valid (since we know the number of significant digits used in the HITRAN database).
https://stackoverflow.com/questions/39134956/accuracy-of-float32

" # Print final result\n",
" print(\"___MEMORY USAGE AFTER COMPLETION:___\")\n",
" mem_usg = props.memory_usage().sum() / 1024**2 \n",
" print(\"Memory usage is: \",mem_usg,\" MB\")\n",
" print(\"This is \",100*mem_usg/start_mem_usg,\"% of the initial size\")\n",
" return props, NAlist"
],
"execution_count": 9,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "JIrnVcoSX1wo"
},
"source": [
"### Fetch Databank"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9SCd3A3qXx1h",
"outputId": "2feaaa4d-a152-4ac2-ce53-fc74634a5885"
},
"source": [
"df0 = fetch_hitemp(molecule='CH4', databank_name='HITEMP-CH4', isotope='1, 2, 3', load_wavenum_min=2000, load_wavenum_max=4000, local_databases='~/')"
],
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"text": [
"Using existing database HITEMP-CH4\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kfIfc1fSYoDd"
},
"source": [
"### Let's Crunch"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oAgwp-vwYIH6",
"outputId": "22309e27-3fda-4dc4-96e3-b1094f159e4c"
},
"source": [
"df1, NAlist = reduce_mem_usage(df0)\n",
"print(\"_________________\")\n",
"print(\"\")\n",
"print(\"Warning: the following columns have missing values filled with -1': \")\n",
"print(\"_________________\")\n",
"print(\"\")\n",
"print(NAlist)"
],
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"text": [
"Memory usage of properties dataframe is : 565.2839660644531 MB\n",
"******************************\n",
"Column: id\n",
"dtype before: int64\n",
"dtype after: uint8\n",
"******************************\n",
"******************************\n",
"Column: iso\n",
"dtype before: int64\n",
"dtype after: uint8\n",
"******************************\n",
"******************************\n",
"Column: wav\n",
"dtype before: float64\n",
"dtype after: float32\n",
"******************************\n",
"******************************\n",
"Column: int\n",
"dtype before: float64\n",
"dtype after: uint8\n",
"******************************\n",
"******************************\n",
"Column: A\n",
"dtype before: float64\n",
"dtype after: float32\n",
"******************************\n",
"******************************\n",
"Column: airbrd\n",
"dtype before: float64\n",
"dtype after: float32\n",
"******************************\n",
"******************************\n",
"Column: selbrd\n",
"dtype before: float64\n",
"dtype after: float32\n",
"******************************\n",
"******************************\n",
"Column: El\n",
"dtype before: float64\n",
"dtype after: float32\n",
"******************************\n",
"******************************\n",
"Column: Tdpair\n",
"dtype before: float64\n",
"dtype after: float32\n",
"******************************\n",
"******************************\n",
"Column: Pshft\n",
"dtype before: float64\n",
"dtype after: float32\n",
"******************************\n",
"******************************\n",
"Column: gp\n",
"dtype before: float64\n",
"dtype after: uint16\n",
"******************************\n",
"******************************\n",
"Column: gpp\n",
"dtype before: float64\n",
"dtype after: uint16\n",
"******************************\n",
"___MEMORY USAGE AFTER COMPLETION:___\n",
"Memory usage is: 349.76945400238037 MB\n",
"This is 61.875 % of the initial size\n",
"_________________\n",
"\n",
"Warning: the following columns have missing values filled with -1': \n",
"_________________\n",
"\n",
"[]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "4vUB-2bbcLoT"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}