Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a plugin for EmailAddress #694

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ If you love `detect-secrets`, please star our project on GitHub to show your sup

<!--
### Unreleased
##### May 16th, 2023
#### :tada: New Features
- Added a detector for EmailAddress ([#694])
[#694]: https://github.com/Yelp/detect-secrets/pull/694
-->

### v1.4.0
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ AzureStorageKeyDetector
BasicAuthDetector
CloudantDetector
DiscordBotTokenDetector
EmailAddressDetector
GitHubTokenDetector
Base64HighEntropyString
HexHighEntropyString
Expand Down
61 changes: 61 additions & 0 deletions detect_secrets/plugins/email_address.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import re

from .base import RegexBasedDetector


class EmailAddressDetector(RegexBasedDetector):
    """
    Detect email addresses embedded in a line of text.

    The detector uses a deliberately loose regular expression that models the
    common ``local@domain.tld`` shape instead of implementing RFC 5322.

    Features:
    - Detects a wide range of everyday email formats with a single regex.
    - Skips the addresses listed in ``whitelist`` to reduce false positives.

    Limitations:
    - May miss edge cases or unconventional formats (quoted local parts,
      IP-literal domains, TLDs longer than four letters).
    - Not compliant with advanced formats, e.g., RFC 6530 non-Latin emails.

    Regular Expression:
    ``base_pattern`` describes the local part, the domain and the TLD. It is
    prefixed with a negative lookahead that rejects any match starting with a
    complete whitelisted address.

    References:
    - https://en.wikipedia.org/wiki/Email_address
    - https://stackoverflow.com/a/14321045
    """
    secret_type = 'Email Address'

    # Excludes whitelisted email addresses from detection to reduce false positives.
    whitelist = ['noreply@github.com', 'git@github.com']

    # Compiled with re.VERBOSE below, so the whitespace and the `#` comments
    # inside this string are not part of the pattern.
    base_pattern = r"""
        [\w+-]+                 # Local part before the @ symbol
        (?:\.[\w+-]+)*          # Optional dot-separated words in the local part
        @                       # The @ symbol
        [\w+-]+                 # Domain part after the @ symbol
        (?:\.[\w+-]+)*          # Optional dot-separated words in the domain part
        (?:\.[a-zA-Z]{2,4})     # TLD part
    """
    # Pattern Breakdown:
    # 1. [\w+-]+: Matches one or more word characters (letters, digits, _), + or -
    #    Represents the local part of the email address before the @ symbol.
    # 2. (?:\.[\w+-]+)*: Matches zero or more groups made of a . (dot) followed by
    #    one or more word characters, + or -
    #    Allows for dot-separated words in the local part of the email address.
    # 3. @: Matches the @ symbol.
    # 4. [\w+-]+: Matches one or more word characters (letters, digits, _), + or -
    #    Represents the domain part of the email address after the @ symbol.
    # 5. (?:\.[\w+-]+)*: Matches zero or more groups made of a . (dot) followed by
    #    one or more word characters, + or -
    #    Allows for dot-separated words in the domain part of the email address.
    # 6. (?:\.[a-zA-Z]{2,4}): Matches a . (dot) followed by 2 to 4 ASCII letters
    #    Represents the TLD (top-level domain) part of the email address.

    # Negative lookahead that rejects a match starting with a whitelisted address.
    #
    # The alternatives are wrapped in their own group so that the trailing
    # condition applies to every whitelisted address; without the group, `|`
    # (lowest precedence) would attach it to the last alternative only.
    #
    # The trailing condition is "the address does not continue" rather than
    # "end of line": a whitelisted address is skipped when followed by
    # punctuation or other text (e.g. `git@github.com:org/repo.git`), while a
    # longer address that merely starts with a whitelisted one (e.g.
    # `noreply@github.company.com`) is still reported.
    deny_pattern = (
        r'(?!(?:'
        + '|'.join(re.escape(email) for email in whitelist)
        + r')(?![\w+-]|\.[\w+-]))'
        + base_pattern
    )

    denylist = [
        # \b on both sides keeps the match from starting or ending in the
        # middle of a run of word characters.
        re.compile(r'\b' + deny_pattern + r'\b', flags=re.VERBOSE),
    ]
112 changes: 112 additions & 0 deletions tests/plugins/email_address_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import pytest

from detect_secrets.plugins.email_address import EmailAddressDetector


class TestEmailAddressDetector:
    """
    Tests for EmailAddressDetector.analyze_line.

    Testing strategy

    Cover the cartesian product of these partitions:

    1. Partition on email address format:
        a. Strings the detector should report
        b. Strings the detector should not report

    2. Partition on line content:
        a. email address is the only content
        b. email address is part of a larger string

    And cover these cases:

    1. Partition on whitelist email addresses:
        a. email address is in the whitelist
        b. email address is not in the whitelist

    Each payload is expected to produce exactly one finding
    (``should_flag`` is True) or none at all (``should_flag`` is False).
    """

    @pytest.mark.parametrize(
        'payload, should_flag',
        [
            # Address-shaped strings, only content.
            # Some of these are not valid hostnames (e.g. a leading or trailing
            # hyphen in the domain) but the detector is intentionally loose.
            ('user@example.com', True),
            ('user.name@example.com', True),
            ('user_name@example.com', True),
            ('user-name@example.com', True),
            ('user+name@example.com', True),
            ('user@ex_ample.com', True),
            ('user@-example.com', True),
            ('user@example-.com', True),
            ('user.name+category@example.com', True),
            # Email addresses, part of larger string
            ('This is an email address: user@example.com', True),
            ('user@example.com is a valid email address', True),
            # Invalid email addresses
            ('user@com', False),
            ('@example.com', False),
            ('user@.com', False),
            ('user@ex..com', False),
            # Whitelist email addresses
            ('noreply@github.com', False),
            ('git@github.com', False),
            # Non-whitelist email addresses
            ('user@gmail.com', True),
            ('user@yahoo.com', True),
            ('user@hotmail.com', True),

            # Additional test cases

            # Email addresses with different domain extensions
            ('user@domain.co.uk', True),
            ('user@domain.io', True),
            ('user@domain.org', True),
            ('user@sub.domain.com', True),

            # Email addresses with numbers
            ('user123@example.com', True),
            ('123user@example.com', True),
            ('user123@123example.com', True),

            # Email addresses, part of larger text with special characters
            ('Contact us at: user@example.com!', True),
            ('Email: user@example.com for more info.', True),

            # Invalid email addresses with missing components
            ('user@example', False),
            ('user@', False),

            # Invalid email addresses with special characters
            ('user@exa*mple.com', False),
            ('user@examp!e.com', False),
            ('user@exampl$.com', False),
            ('user@exam^ple.com', False),

            # Unusual formats, mark as false
            ('"user"@example.com', False),  # Quoted local part
            ('user@[123.123.123.123]', False),  # IP address domain

            # Invalid email addresses, incorrect use of special characters
            ('user@exa,mple.com', False),
            ('user@examp<le.com', False),
            ('user@exampl>com', False),
            ('user@exampl;e.com', False),

            # Edge cases - rare but valid email formats.
            # The local parts contain characters the pattern does not accept
            # (/, =, !, %); these are flagged because the tail of the string
            # after such a character is still address-shaped.
            ('user+mailbox/department=shipping@example.com', True),
            ('customer/department=shipping@example.com', True),
            ('!def!xyz%abc@example.com', True),
            ('_Yosemite.Sam@example.com', True),

            # Edge cases - position of . (dot)
            ('user@example..com', False),  # Double dot in domain
            ('.user@example.com', True),  # Leading dot in local part
            ('user@.example.com', False),  # Leading dot in domain
            ('user@example.com.', True),  # Trailing dot in domain
        ],
    )
    def test_analyze_line(self, payload, should_flag):
        logic = EmailAddressDetector()

        output = logic.analyze_line(filename='mock_filename', line=payload)
        # int(should_flag) is 1 or 0: a flagged payload must yield exactly one
        # finding, an unflagged payload none.
        assert len(output) == int(should_flag)
Loading