以下示例在分割段落时不区分中英文标点(两者都视为分隔符),你可以根据需求修改正则表达式中的标点集合。
分割后,标点保留在对应句子的末尾:
def split_sentences_keep_delimiter(text):
    """Split *text* into sentences, keeping each delimiter on its sentence.

    A sentence is a run of non-delimiter characters plus the single
    delimiter that closes it. The delimiters are the ideographic full
    stop and both the ASCII and full-width forms of ``! ? : ; ,``.
    The ASCII period ``.`` is not in the set, so it does not split.

    Text left over after the last delimiter is returned as a final item
    with surrounding whitespace stripped; a whitespace-only remainder is
    omitted. Delimiters with no text in front of them (leading or
    repeated marks) are skipped.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings in their original order.
    """
    # Full-width ! ? : ; , are spelled as \uXXXX escapes, which ``re``
    # resolves itself, so the pattern survives width normalisation of
    # this source file.
    delimiters = r'。!?:;,\uff01\uff1f\uff1a\uff1b\uff0c'
    # The closing delimiter is optional so that the final, unterminated
    # run of text is matched as well instead of being lost.
    segment = '([^' + delimiters + ']+)([' + delimiters + ']?)'

    sentences = []
    for match in re.finditer(segment, text):
        body, mark = match.groups()
        if mark:
            sentences.append(body + mark)
            continue
        # Only a run that reaches the end of the text has no delimiter.
        tail = body.strip()
        if tail:
            sentences.append(tail)
    return sentences
分割后去掉标点,只保留文本:
import re
def split_text_with_punctuation(text):
    """Split *text* at punctuation marks and discard the marks.

    The delimiters are the ideographic full stop, the ASCII period, and
    both the ASCII and full-width forms of ``! ? : ; ,``.

    Args:
        text: The string to split.

    Returns:
        The list produced by ``re.split``: the text pieces between
        delimiters, in order. As with any ``re.split`` call, a delimiter
        at the very start or end of *text*, or two adjacent delimiters,
        yields an empty string at that position.
    """
    # Full-width ! ? : ; , are spelled as \uXXXX escapes, which ``re``
    # resolves itself, so the pattern survives width normalisation of
    # this source file.
    delimiter = re.compile(r'[。.!?:;,\uff01\uff1f\uff1a\uff1b\uff0c]')
    return delimiter.split(text)
# Demo: split a sample sentence and print the resulting list.
text = "你好,世界!这是个测试。看看是否有效?当然,它会的。"
pieces = split_text_with_punctuation(text)
print(pieces)
分割后,标点和文本分开,各自作为列表中的独立元素:
import re
def split_text_with_punctuation(text):
    """Split *text* at punctuation marks, keeping the marks as own items.

    The delimiters are the ideographic full stop, the ASCII period, and
    both the ASCII and full-width forms of ``! ? : ; ,``.

    Args:
        text: The string to split.

    Returns:
        The list produced by ``re.split`` with a capturing group: text
        pieces and the delimiters that separated them alternate, in
        order. A delimiter at the very start or end of *text*, or two
        adjacent delimiters, yields an empty string for the missing
        text piece.
    """
    # The capturing group makes ``re.split`` return the delimiters too.
    # Full-width ! ? : ; , are spelled as \uXXXX escapes, which ``re``
    # resolves itself, so the pattern survives width normalisation of
    # this source file.
    delimiter = re.compile(r'([。.!?:;,\uff01\uff1f\uff1a\uff1b\uff0c])')
    return delimiter.split(text)
# Demo: split a sample sentence and print the resulting list.
text = "你好,世界!这是个测试。看看是否有效?当然,它会的。"
pieces = split_text_with_punctuation(text)
print(pieces)