Skip to content

Instantly share code, notes, and snippets.

@Wauplin
Last active October 14, 2025 10:54
Show Gist options
  • Select an option

  • Save Wauplin/a7a385db01ddbf0067325ec02bc35ce0 to your computer and use it in GitHub Desktop.

Select an option

Save Wauplin/a7a385db01ddbf0067325ec02bc35ce0 to your computer and use it in GitHub Desktop.
Parse byte-size and duration strings with units (string => int)
import re
import pytest
# One or more digits immediately followed by one or more letters, e.g. "10k"
# or "5mo". Group 1 captures the number, group 2 the unit suffix; letter case
# is ignored when matching.
RE_NUMBER_WITH_UNIT = re.compile(r"(\d+)([a-z]+)", flags=re.IGNORECASE)
# Byte multipliers keyed by lowercase one-letter suffix. These are decimal
# (power-of-1000) units, not binary (power-of-1024) ones.
BYTE_UNITS = {
    "k": 10**3,
    "m": 10**6,
    "g": 10**9,
    "t": 10**12,
    "p": 10**15,
}
# Seconds per unit, keyed by lowercase suffix. A month ("mo") is fixed at
# 30 days and a year ("y") at 365 days.
TIME_UNITS = {
    "s": 1,
    "m": 60,
    "h": 3_600,
    "d": 86_400,
    "w": 604_800,
    "mo": 2_592_000,
    "y": 31_536_000,
}
def parse_size(value: str) -> int:
    """Convert a size string into a number of bytes.

    The string is either a plain integer or an integer directly followed by
    one of the suffixes defined in ``BYTE_UNITS``. Parsing is delegated to
    ``_parse_with_unit``.

    Examples:
        "10" -> 10
        "10k" -> 10_000
        "5M" -> 5_000_000
        "2G" -> 2_000_000_000
        "1T" -> 1_000_000_000_000

    Not supported:
        "1.5G" -> ValueError
        "3KB" -> ValueError
        "-5M" -> ValueError

    Raises:
        ValueError: If ``value`` is malformed or uses an unknown unit.
    """
    size_in_bytes = _parse_with_unit(value, BYTE_UNITS)
    return size_in_bytes
def parse_duration(value: str) -> int:
    """Convert a duration string into a number of seconds.

    The string is either a plain integer or an integer directly followed by
    one of the suffixes defined in ``TIME_UNITS``. Parsing is delegated to
    ``_parse_with_unit``.

    Examples:
        "10s" -> 10
        "5m" -> 300
        "2h" -> 7200
        "1d" -> 86400

    Not supported:
        "1.5h" -> ValueError
        "3month" -> ValueError
        "-5m" -> ValueError

    Raises:
        ValueError: If ``value`` is malformed or uses an unknown unit.
    """
    duration_in_seconds = _parse_with_unit(value, TIME_UNITS)
    return duration_in_seconds
def _parse_with_unit(value: str, units: dict[str, int]) -> int:
    """Parse a non-negative integer with an optional unit suffix.

    Args:
        value: Either bare digits (``"10"``) or digits immediately followed
            by a unit suffix (``"10k"``). The suffix is lowercased before
            lookup, so ``"10K"`` and ``"10k"`` are equivalent.
        units: Mapping from lowercase unit suffix to its integer multiplier.

    Returns:
        The number itself when no suffix is given, otherwise the number
        multiplied by the multiplier of its unit.

    Raises:
        ValueError: If ``value`` is not made of digits plus an optional
            letter suffix (signs, whitespace, underscores and decimal points
            are all rejected), or if the suffix is not a key of ``units``.
    """
    # Plain number: accept bare digits only. A bare ``int(value)`` would also
    # accept "-5", "+5", " 10 " or "1_000", all of which are rejected when a
    # unit suffix is present; keep both forms equally strict.
    if value.isdecimal():
        return int(value)

    # Number followed by a unit suffix.
    match = RE_NUMBER_WITH_UNIT.fullmatch(value)
    if match is None:
        raise ValueError(f"Invalid value '{value}'. Must match pattern '\\d+[a-z]+' or be a plain number.")

    number = int(match.group(1))
    unit = match.group(2).lower()
    if unit not in units:
        raise ValueError(f"Unknown unit '{unit}'. Must be one of {list(units.keys())}.")
    return number * units[unit]
@pytest.mark.parametrize(
    "value,expected",
    [
        ("10", 10),
        ("10k", 10_000),
        ("5M", 5_000_000),
        ("2G", 2_000_000_000),
        ("1T", 1_000_000_000_000),
        # The "p" unit is defined in BYTE_UNITS; cover it as well.
        ("1P", 1_000_000_000_000_000),
        ("0", 0),
    ],
)
def test_parse_size_valid(value, expected):
    """Well-formed size strings parse to the expected number of bytes."""
    # The parameter is named ``value`` rather than ``input`` to avoid
    # shadowing the builtin.
    assert parse_size(value) == expected
@pytest.mark.parametrize(
    "value",
    [
        "1.5G",  # decimals are not supported
        "3KB",  # multi-letter byte units are not supported
        "-5M",  # negative numbers are not supported
        "10X",  # unknown unit
        "abc",  # no number
        "",  # empty string
        "123abc456",  # trailing digits after the unit
        " 10 K",  # embedded whitespace
    ],
)
def test_parse_size_invalid(value):
    """Malformed size strings are rejected with ``ValueError``."""
    # The parameter is named ``value`` rather than ``input`` to avoid
    # shadowing the builtin.
    with pytest.raises(ValueError):
        parse_size(value)
@pytest.mark.parametrize(
    "value,expected",
    [
        ("10s", 10),
        ("5m", 300),
        ("2h", 7200),
        ("1d", 86400),
        ("1w", 604800),
        ("1mo", 2592000),
        ("1y", 31536000),
        ("0", 0),
    ],
)
def test_parse_duration_valid(value, expected):
    """Well-formed duration strings parse to the expected number of seconds."""
    # The parameter is named ``value`` rather than ``input`` to avoid
    # shadowing the builtin.
    assert parse_duration(value) == expected
@pytest.mark.parametrize(
    "value",
    [
        "1.5h",  # decimals are not supported
        "3month",  # only the short "mo" suffix is a known unit
        "-5m",  # negative numbers are not supported
        "10X",  # unknown unit
        "abc",  # no number
        "",  # empty string
        "123abc456",  # trailing digits after the unit
        " 10 m",  # embedded whitespace
    ],
)
def test_parse_duration_invalid(value):
    """Malformed duration strings are rejected with ``ValueError``."""
    # The parameter is named ``value`` rather than ``input`` to avoid
    # shadowing the builtin.
    with pytest.raises(ValueError):
        parse_duration(value)
@Wauplin
Copy link
Author

Wauplin commented Oct 14, 2025

Written as part of the review of huggingface/huggingface_hub#3439. This is minimal parsing logic that handles only basic cases.

The tests were auto-generated by Copilot + inference providers.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment