Last active
October 14, 2025 10:54
-
-
Save Wauplin/a7a385db01ddbf0067325ec02bc35ce0 to your computer and use it in GitHub Desktop.
Parse byte and duration units (string => int)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import pytest | |
# One or more digits immediately followed by one or more letters (the unit).
# IGNORECASE lets the unit be written in upper or lower case.
RE_NUMBER_WITH_UNIT = re.compile(r"(\d+)([a-z]+)", flags=re.IGNORECASE)

# Size suffix -> number of bytes. Each suffix is the next power of 1000.
BYTE_UNITS = {suffix: 1000**power for power, suffix in enumerate("kmgtp", start=1)}

# Duration suffix -> number of seconds.
# "mo" is counted as 30 days and "y" as 365 days.
TIME_UNITS = {
    "s": 1,
    "m": 60,
    "h": 3_600,
    "d": 86_400,
    "w": 604_800,
    "mo": 2_592_000,
    "y": 31_536_000,
}
def parse_size(value: str) -> int:
    """Convert a human-readable size string into a number of bytes.

    The string is either a bare integer or an integer directly followed by
    one of the suffixes in ``BYTE_UNITS`` (``k``, ``m``, ``g``, ``t``, ``p``),
    written in either case. Each suffix is a power of 1000.

    Accepted:
        "10"  -> 10
        "10k" -> 10_000
        "5M"  -> 5_000_000
        "2G"  -> 2_000_000_000
        "1T"  -> 1_000_000_000_000

    Rejected with ``ValueError``:
        "1.5G"  (fractional number)
        "3KB"   (unknown suffix)
        "-5M"   (signed number with a suffix)
    """
    size_in_bytes = _parse_with_unit(value, units=BYTE_UNITS)
    return size_in_bytes
def parse_duration(value: str) -> int:
    """Convert a human-readable duration string into a number of seconds.

    The string is either a bare integer or an integer directly followed by
    one of the suffixes in ``TIME_UNITS`` (``s``, ``m``, ``h``, ``d``, ``w``,
    ``mo``, ``y``), written in either case.

    Accepted:
        "10s" -> 10
        "5m"  -> 300
        "2h"  -> 7200
        "1d"  -> 86400

    Rejected with ``ValueError``:
        "1.5h"    (fractional number)
        "3month"  (unknown suffix)
        "-5m"     (signed number with a suffix)
    """
    duration_in_seconds = _parse_with_unit(value, units=TIME_UNITS)
    return duration_in_seconds
def _parse_with_unit(value: str, units: dict[str, int]) -> int:
    """Parse an unsigned integer with an optional unit suffix.

    Args:
        value: Text such as ``"10"`` or ``"5m"``: digits only, or digits
            immediately followed by letters naming a unit.
        units: Mapping from lowercase unit suffix to its multiplier.

    Returns:
        The parsed integer, multiplied by the unit's factor when a suffix is
        present.

    Raises:
        ValueError: If ``value`` is neither a run of digits nor digits followed
            by letters, or if the suffix (lowercased) is not a key of ``units``.
    """
    # A plain number must be digits only. Calling int() on the raw string would
    # also accept a sign, surrounding whitespace and underscores ("-5", " 10 ",
    # "1_000"), none of which are accepted once a unit is attached. Gating on
    # isdecimal() keeps both forms on the same grammar.
    if value.isdecimal():
        return int(value)

    match = RE_NUMBER_WITH_UNIT.fullmatch(value)
    if match is None:
        raise ValueError(f"Invalid value '{value}'. Must match pattern '\\d+[a-z]+' or be a plain number.")

    digits, suffix = match.groups()
    unit = suffix.lower()  # unit keys are stored lowercase
    if unit not in units:
        raise ValueError(f"Unknown unit '{unit}'. Must be one of {list(units)}.")
    return int(digits) * units[unit]
@pytest.mark.parametrize(
    "input,expected",
    [
        ("10", 10),
        ("10k", 10_000),
        ("10K", 10_000),  # same unit in upper case
        ("5M", 5_000_000),
        ("2G", 2_000_000_000),
        ("1T", 1_000_000_000_000),
        ("3P", 3_000_000_000_000_000),  # "p" is a declared unit and needs coverage too
        ("0", 0),
    ],
)
def test_parse_size_valid(input, expected):
    """Plain integers and every size suffix parse to the expected byte count."""
    assert parse_size(input) == expected
@pytest.mark.parametrize(
    "input",
    [
        "1.5G",  # fractional number
        "3KB",  # "kb" is not a known unit
        "-5M",  # signed number with a unit
        "10X",  # "x" is not a known unit
        "abc",  # no digits
        "",  # empty string
        "123abc456",  # digits after the unit
        " 10 K",  # whitespace around and inside the value
    ],
)
def test_parse_size_invalid(input):
    """Malformed size strings are rejected with ``ValueError``."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        parse_size(input)
@pytest.mark.parametrize(
    "input,expected",
    [
        ("10s", 10),
        ("5m", 300),
        ("2h", 7200),
        ("2H", 7200),  # units are matched case-insensitively
        ("1d", 86400),
        ("1w", 604800),
        ("1mo", 2592000),
        ("1Mo", 2592000),  # mixed case on the two-letter unit
        ("1y", 31536000),
        ("10", 10),  # plain number, no unit
        ("0", 0),
    ],
)
def test_parse_duration_valid(input, expected):
    """Plain integers and every duration suffix parse to the expected seconds."""
    assert parse_duration(input) == expected
@pytest.mark.parametrize(
    "input",
    [
        "1.5h",  # fractional number
        "3month",  # "month" is not a known unit
        "-5m",  # signed number with a unit
        "10X",  # "x" is not a known unit
        "abc",  # no digits
        "",  # empty string
        "123abc456",  # digits after the unit
        " 10 m",  # whitespace around and inside the value
    ],
)
def test_parse_duration_invalid(input):
    """Malformed duration strings are rejected with ``ValueError``."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        parse_duration(input)
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Written as part of the review of huggingface/huggingface_hub#3439. Minimal parsing logic that handles only basic cases.
Tests were auto-generated by Copilot + inference providers.